Sfarzi committed on
Commit
800b2b6
·
1 Parent(s): 2e9e52c

Initial clone with modifications

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Makefile +13 -0
  2. app.py +827 -0
  3. e3c_llm_requests/.gitattributes +59 -0
  4. e3c_llm_requests/Henrychur/MMed-Llama-3-8B.json +8 -0
  5. e3c_llm_requests/HiTZ/Medical-mT5-large.json +8 -0
  6. e3c_llm_requests/Qwen/Qwen2.5-14B-Instruct-1M.json +8 -0
  7. e3c_llm_requests/Qwen/Qwen2.5-32B-Instruct.json +8 -0
  8. e3c_llm_requests/Qwen/Qwen3-30B-A3B-Instruct-2507.json +8 -0
  9. e3c_llm_requests/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B-checkpoint.json +8 -0
  10. e3c_llm_requests/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.json +8 -0
  11. e3c_llm_requests/epfl-llm/meditron-7b.json +8 -0
  12. e3c_llm_requests/google/gemma-2-9b-it.json +8 -0
  13. e3c_llm_requests/google/gemma-3-27b-it.json +8 -0
  14. e3c_llm_requests/google/medgemma-27b-text-it.json +8 -0
  15. e3c_llm_requests/google/medgemma-4b-it.json +8 -0
  16. e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json +8 -0
  17. e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json +8 -0
  18. e3c_llm_requests/microsoft/MediPhi-Clinical.json +8 -0
  19. e3c_llm_requests/microsoft/MediPhi-Instruct.json +8 -0
  20. e3c_llm_requests/mistralai/Mistral-7B-Instruct-v0.2.json +8 -0
  21. e3c_llm_requests/mistralai/Mistral-Nemo-Instruct-2407.json +8 -0
  22. e3c_llm_requests/tiiuae/Falcon3-10B-Instruct.json +8 -0
  23. e3c_llm_requests/unsloth/phi-4.json +8 -0
  24. e3c_llm_results/.gitattributes +59 -0
  25. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_EN.json +69 -0
  26. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_GR.json +69 -0
  27. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_IT.json +69 -0
  28. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_PL.json +69 -0
  29. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SK.json +69 -0
  30. e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SL.json +69 -0
  31. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_EN.json +69 -0
  32. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_GR.json +69 -0
  33. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_IT.json +69 -0
  34. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_PL.json +69 -0
  35. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SK.json +69 -0
  36. e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SL.json +69 -0
  37. e3c_llm_results/HiTZ/Medical-mT5-large_0_EN.json +69 -0
  38. e3c_llm_results/HiTZ/Medical-mT5-large_0_GR.json +69 -0
  39. e3c_llm_results/HiTZ/Medical-mT5-large_0_IT.json +63 -0
  40. e3c_llm_results/HiTZ/Medical-mT5-large_0_PL.json +69 -0
  41. e3c_llm_results/HiTZ/Medical-mT5-large_0_SK.json +69 -0
  42. e3c_llm_results/HiTZ/Medical-mT5-large_0_SL.json +69 -0
  43. e3c_llm_results/HiTZ/Medical-mT5-large_10_EN.json +69 -0
  44. e3c_llm_results/HiTZ/Medical-mT5-large_10_GR.json +69 -0
  45. e3c_llm_results/HiTZ/Medical-mT5-large_10_IT.json +69 -0
  46. e3c_llm_results/HiTZ/Medical-mT5-large_10_PL.json +69 -0
  47. e3c_llm_results/HiTZ/Medical-mT5-large_10_SK.json +69 -0
  48. e3c_llm_results/HiTZ/Medical-mT5-large_10_SL.json +69 -0
  49. e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_EN.json +69 -0
  50. e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_GR.json +69 -0
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Declare both targets phony: neither produces a file named "style" or
# "quality".  (Fixed: previously declared "style format", but the second
# target defined below is "quality", not "format".)
.PHONY: style quality

# Reformat the codebase in place.
style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .

# Check formatting/lint without modifying files (CI gate).
quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
app.py ADDED
@@ -0,0 +1,827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
7
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
8
+ from src.display.css_html_js import custom_css
9
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
10
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
11
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
12
+ from src.submission.submit import add_new_eval
13
+ import random
14
+ import matplotlib.pyplot as plt
15
+ import re
16
+ import plotly.express as px
17
+ import plotly.graph_objects as go
18
+ import numpy as np
19
+
20
+
21
def mean_of_max_per_field(df, fields=None):
    """Return the mean of the per-task maxima of the leaderboard table.

    This is the "theoretical best ensemble" score: for each task column the
    best score achieved by any model is taken, and those maxima are averaged.

    Args:
        df (pd.DataFrame): leaderboard table with one numeric column per task
            (e.g. "NER", "REL").
        fields (list[str] | None): task columns to aggregate; defaults to
            ["NER", "REL"], the tasks used by this leaderboard.

    Returns:
        float: mean of the maximum value of each requested column.

    Raises:
        ValueError: if any requested column is missing from ``df``.
    """
    if fields is None:
        # Previously hard-coded; kept as the default for backward compatibility.
        fields = ["NER", "REL"]

    # Fail fast with an explicit message when a task column is absent.
    missing = [f for f in fields if f not in df.columns]
    if missing:
        raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")

    # Best score per task, regardless of which model achieved it.
    max_values = df[fields].max()

    # Average of the per-task bests.
    return max_values.mean()
47
+
48
+
49
def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
    """Bar chart of the mean (few-shot − zero-shot) accuracy gap per task.

    For every task, models evaluated in both modes are matched by name and
    the average of their score differences is plotted as one bar.
    """
    if tasks is None:
        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    # Split the table once by evaluation mode.
    fs_rows = dataframe[dataframe['IS_FS'] == True]
    zs_rows = dataframe[dataframe['IS_FS'] == False]

    deltas = {}
    for task in tasks:
        if task not in dataframe.columns:
            continue

        # Align few-shot and zero-shot scores of the same model, dropping
        # models that lack a score in either mode.
        paired = pd.merge(
            fs_rows[["Model", task]],
            zs_rows[["Model", task]],
            on="Model",
            suffixes=("_few", "_zero"),
        ).dropna(subset=[f"{task}_few", f"{task}_zero"])

        if paired.empty:
            continue

        deltas[task] = (paired[f"{task}_few"] - paired[f"{task}_zero"]).mean()

    fig = go.Figure([go.Bar(
        x=list(deltas.keys()),
        y=list(deltas.values()),
        marker_color="#ff7f0e",
        text=[f"{v:.2f}" for v in deltas.values()],
        textposition="outside",
        hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
    )])

    fig.update_layout(
        title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task",
        xaxis_title="",
        yaxis_title="Mean Delta Combined Performance",
        template="plotly_white",
        font=dict(family="Arial", size=13),
    )

    # Caption below the chart.
    fig.add_annotation(
        text="10-shot learning generally outperforms zero-shot. <br>"
             "",
        xref="paper", yref="paper",
        x=0, y=-0.2,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="left"
    )

    return fig
119
+
120
+
121
def boxplot_per_task(dataframe=None, baselines=None, references=None):
    """Box plot of model combined-performance distributions per task.

    Args:
        dataframe: leaderboard table with one numeric column per task; when
            None, reproducible random demo data is generated (seed 42).
        baselines: optional {task: score} dict drawn as dotted black lines
            (EVALITA supervised results); random demo values when None.
        references: optional {task: score} dict drawn as red dash-dot lines
            (GPT-4o results); treated as empty when None.

    Returns:
        plotly.graph_objects.Figure
    """
    tasks = ["NER", "REL"]

    if dataframe is None:
        # Demo mode: fake scores in the 40-90% range.
        np.random.seed(42)
        dataframe = pd.DataFrame({
            task: np.random.uniform(0.4, 0.9, 20) * 100
            for task in tasks
        })

    if baselines is None:
        baselines = {task: np.random.randint(50, 70) for task in tasks}

    # BUG FIX: `references` previously had no None guard, so calling this
    # function with the default raised TypeError on `task in references`.
    if references is None:
        references = {}

    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

    fig = go.Figure()

    for i, task in enumerate(tasks):
        if task in dataframe.columns:
            y_data = dataframe[task].dropna().tolist()

            # Distribution of all model scores for this task.
            fig.add_trace(go.Box(
                y=y_data,
                name=task,
                marker=dict(color=colors[i]),
                line=dict(color="black", width=2),
                fillcolor=colors[i],
                opacity=0.7,
                hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
                width=0.6,
                whiskerwidth=0.2,
                quartilemethod="linear"
            ))

            # Supervised baseline: dotted black segment centred on the box.
            if task in baselines and baselines[task] is not None:
                fig.add_shape(
                    type="line",
                    x0=i - 0.3, x1=i + 0.3,
                    y0=baselines[task], y1=baselines[task],
                    line=dict(color="black", width=2, dash="dot"),
                    xref="x", yref="y"
                )

            # GPT-4o reference: red dash-dot segment.
            if task in references and references[task] is not None:
                fig.add_shape(
                    type="line",
                    x0=i - 0.3, x1=i + 0.3,
                    y0=references[task], y1=references[task],
                    line=dict(color="red", width=2, dash="dashdot"),
                    xref="x", yref="y"
                )

    fig.update_layout(
        title="Distribution of Model Accuracy by Task",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        template="plotly_white",
        boxmode="group",
        dragmode=False,
        font=dict(family="Arial", size=10),
        margin=dict(b=80),
    )

    # Caption placeholder below the chart (text intentionally empty).
    fig.add_annotation(
        text=(""),
        xref="paper", yref="paper",
        x=0.5, y=-0.30,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="left"
    )

    fig.update_yaxes(range=[0, 100], fixedrange=True)

    return fig
216
+
217
# EVALITA supervised-system reference scores per task, in percent
# (drawn as dotted black baseline segments on the box plot).
BASELINES = {
    "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
    "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99,
}

# GPT-4o reference scores per task, in percent
# (drawn as red dash-dot segments on the box plot).
REFERENCES = {
    "NER": 79.11,
    "REL": 63.32,
    "LS": 59.25,
    "SU": 33.04,
}
231
+
232
+
233
def boxplot_prompts_per_task(dataframe, tasks=None):
    """Grouped bar chart comparing average vs. best-prompt accuracy per task.

    Args:
        dataframe: leaderboard table with "<task> Prompt Average",
            "<task> Best Prompt" and "<task> Best Prompt Id" columns.
        tasks: tasks to plot; defaults to the full benchmark list.

    Returns:
        plotly.graph_objects.Figure
    """
    if tasks is None:
        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    # BUG FIX: work on a copy so the caller's DataFrame is not mutated, and
    # skip id columns that are absent instead of raising KeyError.
    dataframe = dataframe.copy()
    cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
    for col in cols_to_update:
        if col in dataframe.columns:
            # Remap generative-task prompt ids 1/2 onto their global ids 7/8.
            dataframe[col] = dataframe[col].replace({1: 7, 2: 8})

    fig = go.Figure()

    # Accumulators so "Average" and "Best" each get a single legend entry.
    avg_x, avg_y = [], []
    best_x, best_y, best_text = [], [], []

    for task in tasks:
        avg_col = f"{task} Prompt Average"
        best_col = f"{task} Best Prompt"
        best_id_col = f"{task} Best Prompt Id"

        if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
            avg_x.append(task)
            avg_y.append(dataframe[avg_col].mean())

            best_x.append(task)
            best_y.append(dataframe[best_col].mean())
            best_id = dataframe[best_id_col].mode()[0]  # most frequent best-prompt id
            best_text.append(f"P:{best_id}")

    # Average accuracy bars (blue).
    fig.add_trace(go.Bar(
        x=avg_x,
        y=avg_y,
        name="Avg. Accuracy",
        marker_color="#1f77b4",
    ))

    # Best-prompt bars (red).
    fig.add_trace(go.Bar(
        x=best_x,
        y=best_y,
        name="Best Prompt",
        marker_color="#d62728",
    ))

    # Annotate each best-prompt bar with the winning prompt id.
    for x, y, text in zip(best_x, best_y, best_text):
        fig.add_annotation(
            x=x,
            y=y + 3,  # slightly above the bar
            text=text,
            showarrow=False,
            font=dict(size=12, color="black")
        )

    fig.update_layout(
        title= "Prompt Accuracy: Avg vs Best",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        barmode='group',
        template="plotly_white",
        font=dict(family="Arial", size=10),
        yaxis=dict(range=[0, 100], fixedrange=True)
    )

    # Caption below the chart.
    fig.add_annotation(
        text="There is no single prompt that performs best across all tasks.<br>"
             "Different prompts achieve the highest accuracy on different tasks.",
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center",
        xanchor="center"
    )

    return fig
318
+
319
+
320
def line_chart(dataframe):
    """Scatter plot of average combined performance vs. model size.

    Models are split into 5-shot (blue) and 0-shot (red) traces; marker size
    scales with the parameter count, and the best-scoring model overall is
    highlighted with an arrow annotation.

    Returns:
        plotly.graph_objects.Figure
    """

    def scale_sizes(values, min_size=8, max_size=30):
        # Normalise marker sizes so they are neither tiny nor huge.
        vmin, vmax = min(values), max(values)
        return [
            min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
            for val in values
        ]

    def model_label(cell):
        # BUG FIX: "Model" cells are HTML anchor tags; previously
        # `re.search(...).group(1)` raised AttributeError when a cell held
        # plain text.  Fall back to the raw cell value instead.
        m = re.search(r'>([^<]+)<', cell)
        return m.group(1) if m else cell

    # Split rows by evaluation mode.
    df_true = dataframe[dataframe['IS_FS'] == True]
    df_false = dataframe[dataframe['IS_FS'] == False]

    x_true = df_true['#Params (B)'].tolist()
    y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
    labels_true = [model_label(m) for m in df_true['Model'].tolist()]

    x_false = df_false['#Params (B)'].tolist()
    y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
    labels_false = [model_label(m) for m in df_false['Model'].tolist()]

    fig = go.Figure()

    # 5-shot models (blue).  Guard: scale_sizes crashes on an empty list.
    if x_true:
        fig.add_trace(go.Scatter(
            x=x_true,
            y=y_true,
            mode='markers',
            name='5-Shot',
            marker=dict(
                color='blue',
                size=scale_sizes(x_true)
            ),
            hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
            customdata=labels_true
        ))

    # 0-shot models (red).
    if x_false:
        fig.add_trace(go.Scatter(
            x=x_false,
            y=y_false,
            mode='markers',
            name='0-Shot',
            marker=dict(
                color='red',
                size=scale_sizes(x_false)
            ),
            hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
            customdata=labels_false
        ))

    # Annotate the overall best model (guard against an empty table).
    all_y = y_true + y_false
    all_x = x_true + x_false
    all_labels = labels_true + labels_false
    if all_y:
        max_idx = all_y.index(max(all_y))
        fig.add_annotation(
            x=all_x[max_idx],
            y=all_y[max_idx],
            text=f"{all_labels[max_idx]}",
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="black",
            font=dict(size=11, color="black"),
            xshift=10,
            yshift=10,
            ax=-30, ay=-20,  # place the label left of and above the point
            xanchor="right"  # right-align the label relative to the point
        )

    fig.update_layout(
        title="Avg. Combined Performance vs #Params",
        xaxis_title="#Params (B)",
        yaxis_title="Avg. Combined Performance",
        template="plotly_white",
        hovermode="closest",
        font=dict(family="Arial", size=10),
        dragmode=False,
        xaxis=dict(
            tickvals=[0, 25, 50, 75, 100, 125],
            ticktext=["0", "25", "50", "75", "100"]
        ),
        yaxis=dict(
            tickvals=[0, 20, 40, 60, 80, 100],  # fixed ticks
            range=[0, 100]                      # locked range
        )
    )

    # Caption below the chart.
    fig.add_annotation(
        text="Accuracy generally rises with #Params, but smaller models <br>"
             "with 5-shot can outperform larger zero-shot models.",
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center",
        xanchor="center"
    )

    fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
    fig.update_yaxes(fixedrange=True)

    return fig
434
+
435
+
436
# Multiple-choice task metadata (icon, display name, tooltip).
# All multiple-choice tasks are currently disabled for this leaderboard,
# so no per-task tabs are generated from this mapping.
TASK_METADATA_MULTIPLECHOICE = {}

# Generative task metadata (icon, display name, tooltip); one leaderboard
# tab is created per entry.
TASK_METADATA_GENERATIVE = {
    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
}
453
+
454
def restart_space():
    """Trigger a restart of this Hugging Face Space via the Hub API."""
    API.restart_space(repo_id=REPO_ID)
457
+
458
+
459
def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
    The table is sorted based on the "Avg. Combined Performance" field.

    Args:
        dataframe: full leaderboard table.
        default_selection: kept for interface compatibility; currently unused
            (column selection UI is disabled).
        hidden_columns: columns to hide; defaults to fields marked hidden.

    Raises:
        ValueError: if the DataFrame is empty or None.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    def _decorate_models(df):
        """Append a medal to the best-ranked model of each size class,
        separately for few-shot (🏆) and zero-shot (🎖️) evaluation.
        Assumes *df* is already sorted best-first."""
        sizes = ("🔵🔵🔵", "🔵🔵", "🔵")
        medals = {True: "🏆", False: "🎖️"}
        # One flag per (size, mode) pair: each medal is awarded only once.
        awarded = {(size, fs): False for size in sizes for fs in (True, False)}
        decorated = []
        for _, row in df.iterrows():
            size, fs = row["Size"], bool(row["IS_FS"])
            if size in sizes and not awarded[(size, fs)]:
                decorated.append(f"{row['Model']} {size}{medals[fs]}")
                awarded[(size, fs)] = True
            else:
                decorated.append(row["Model"])
        return decorated

    sorted_dataframe["Model"] = _decorate_models(sorted_dataframe)

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        datatype=[c.type for c in field_list],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
            # Label typo fixed (was "Languges ").
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100], label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )
548
+
549
def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Update and return the leaderboard when a specific task is selected.
    The table is sorted based on the "Combined Performance" field.

    Args:
        dataframe: leaderboard table with task columns renamed to the generic
            "Combined Performance" / "Prompt Average" / "Best Prompt" names.
        default_selection: kept for interface compatibility; currently unused.
        hidden_columns: columns to hide; defaults to fields marked hidden.

    Raises:
        ValueError: if the DataFrame is empty or None.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)

    # Rank follows the sorted position.
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    def _decorate_models(df):
        """Append a medal to the best-ranked model of each size class,
        separately for few-shot (🏆) and zero-shot (🎖️) evaluation.
        Assumes *df* is already sorted best-first."""
        sizes = ("🔵🔵🔵", "🔵🔵", "🔵")
        medals = {True: "🏆", False: "🎖️"}
        # One flag per (size, mode) pair: each medal is awarded only once.
        awarded = {(size, fs): False for size in sizes for fs in (True, False)}
        decorated = []
        for _, row in df.iterrows():
            size, fs = row["Size"], bool(row["IS_FS"])
            if size in sizes and not awarded[(size, fs)]:
                decorated.append(f"{row['Model']} {size}{medals[fs]}")
                awarded[(size, fs)] = True
            else:
                decorated.append(row["Model"])
        return decorated

    sorted_dataframe["Model"] = _decorate_models(sorted_dataframe)

    # NOTE: removed `pd.set_option('display.max_colwidth', None)` — it mutated
    # global pandas state on every call and only served now-removed debug prints.

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        # Extra int column accounts for the appended "Rank".
        datatype=[c.type for c in field_list] + [int],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
            # Label typo fixed (was "Languges ").
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False
    )
632
+
633
+ '''
634
+ # Helper function for leaderboard initialization
635
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
636
+ """Initialize and return a leaderboard."""
637
+ if dataframe is None or dataframe.empty:
638
+ raise ValueError("Leaderboard DataFrame is empty or None.")
639
+
640
+ return Leaderboard(
641
+ value=dataframe,
642
+ datatype=[c.type for c in fields(AutoEvalColumn)],
643
+ select_columns=SelectColumns(
644
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
645
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
646
+ label="Select Columns to Display:",
647
+ ),
648
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
649
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
650
+ filter_columns=[
651
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
652
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
653
+ ],
654
+ bool_checkboxgroup_label="Hide models",
655
+ interactive=False,
656
+ )
657
+ '''
658
+
659
def download_snapshot(repo, local_dir):
    """Download a dataset snapshot from the Hugging Face Hub into *local_dir*.

    On any failure the Space is restarted, since the app cannot serve the
    leaderboard without its data.
    """
    try:
        print(f"Downloading from {repo} to {local_dir}...")
        snapshot_download(
            repo_id=repo,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as e:
        print(f"Error downloading {repo}: {e}")
        restart_space()
667
+
668
+
669
# App start-up: fetch the evaluation requests and results snapshots from the Hub.
download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

# Build the leaderboard table and the three evaluation-queue views.
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

# Upper bound on the average combined performance: mean of the per-task bests.
theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF)
679
+
680
+ # Prepare the main interface
681
+ demo = gr.Blocks(css=custom_css)
682
+ with demo:
683
+ #gr.HTML(TITLE)
684
+ gr.HTML(
685
+ """
686
+ <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
687
+ <h1 style="
688
+ margin: 0 auto;
689
+ font-weight: 900;
690
+ font-size: 2.5em;
691
+ letter-spacing: 2px;
692
+ text-transform: uppercase;
693
+ background: linear-gradient(90deg, #1f77b4, #00c6ff);
694
+ -webkit-background-clip: text;
695
+ -webkit-text-fill-color: transparent;
696
+ text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
697
+ ">
698
+ EVALITA-LLM Leaderboard
699
+ </h1>
700
+ <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank"
701
+ style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">
702
+ <!-- Icona stilizzata -->
703
+ <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">
704
+ <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
705
+ <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
706
+ </svg>
707
+ Open Italian LLM Leaderboard
708
+ </a>
709
+ </div>
710
+ """
711
+ )
712
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
713
+
714
+ # ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
715
+ with gr.Row():
716
+ gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
717
+ gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
718
+ #gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
719
+
720
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
721
+
722
+ # Main leaderboard tab
723
+ with gr.TabItem("🏅 Benchmark"):
724
+
725
+ leaderboard = init_leaderboard(
726
+ LEADERBOARD_DF,
727
+ default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
728
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
729
+ )
730
+
731
+ # gr.HTML(
732
+ # f"""
733
+ # <div style="
734
+ # border: 2px solid #1f77b4;
735
+ # border-radius: 10px;
736
+ # padding: 10px;
737
+ # background-color: #f0f8ff;
738
+ # font-weight: bold;
739
+ # font-size: 14px;
740
+ # display: inline-block;
741
+ # ">
742
+ # Theoretical performance of a model that scores the highest on every individual task: <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
743
+ # </div>
744
+ # $ """
745
+ # )
746
+
747
+ '''
748
+ with gr.TabItem("📈 Charts"):
749
+ #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
750
+ #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
751
+ gr.Plot(value=line_chart(LEADERBOARD_DF))
752
+ gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
753
+ gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
754
+ gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
755
+ '''
756
+
757
+ # About tab
758
+ with gr.TabItem("📝 About"):
759
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
760
+
761
+ # About tab
762
+ with gr.TabItem("║", interactive=False):
763
+ gr.Markdown("", elem_classes="markdown-text")
764
+
765
+
766
+ # Task-specific leaderboards
767
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
768
+
769
+ with gr.TabItem(f"{metadata['icon']}{task}"):
770
+
771
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
772
+ gr.Markdown(task_description, elem_classes="markdown-text")
773
+
774
+ leaderboard = update_task_leaderboard(
775
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
776
+ default_selection=['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
777
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
778
+ )
779
+
780
+ # About tab
781
+ with gr.TabItem("│", interactive=False):
782
+ gr.Markdown("", elem_classes="markdown-text")
783
+
784
+ # Task-specific leaderboards
785
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
786
+ with gr.TabItem(f"{metadata['icon']}{task}"):
787
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
788
+ gr.Markdown(task_description, elem_classes="markdown-text")
789
+
790
+ leaderboard = update_task_leaderboard(
791
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
792
+ f"{task} Prompt Std": "Prompt Std",
793
+ f"{task} Best Prompt": "Best Prompt",
794
+ f"{task} Best Prompt Id": "Best Prompt Id",
795
+ task: "Combined Performance"}),
796
+ default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
797
+ 'Best Prompt Id'],
798
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
799
+ col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
800
+ 'Best Prompt', 'Best Prompt Id']]
801
+ )
802
+
803
+ # Citation section
804
+ with gr.Accordion("📙 Citation", open=False):
805
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
806
+
807
+ with gr.Accordion("📙 Credits", open=False):
808
+ gr.Markdown(
809
+ """
810
+ **This project has benefited from the following support:**
811
+
812
+ - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
813
+
814
+ - 💶 **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
815
+
816
+ - 🖥️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
817
+ """
818
+ )
819
+
820
+ # Background job to restart space
821
+ scheduler = BackgroundScheduler()
822
+ scheduler.add_job(restart_space, "interval", seconds=1800)
823
+ scheduler.start()
824
+
825
+ # Launch the app with concurrent queueing
826
+ demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode
827
+ show_error=True)
e3c_llm_requests/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
e3c_llm_requests/Henrychur/MMed-Llama-3-8B.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Henrychur/MMed-Llama-3-8B",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
5
+ "submitted_time": "2024-05-22 09:17:24+00:00",
6
+ "num_params_billion": null,
7
+ "language": "en_zh_ja_fr_ru_es"
8
+ }
e3c_llm_requests/HiTZ/Medical-mT5-large.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "HiTZ/Medical-mT5-large",
3
+ "base_model": "MT5ForConditionalGeneration",
4
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
5
+ "submitted_time": "2023-10-31 15:15:15+00:00",
6
+ "num_params_billion": null,
7
+ "language": "en_es_fr_it"
8
+ }
e3c_llm_requests/Qwen/Qwen2.5-14B-Instruct-1M.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen2.5-14B-Instruct-1M",
3
+ "base_model": "Qwen2ForCausalLM",
4
+ "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438",
5
+ "submitted_time": "2025-01-23 13:23:24+00:00",
6
+ "num_params_billion": 14.770033664,
7
+ "language": "en"
8
+ }
e3c_llm_requests/Qwen/Qwen2.5-32B-Instruct.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen2.5-32B-Instruct",
3
+ "base_model": "Qwen2ForCausalLM",
4
+ "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd",
5
+ "submitted_time": "2024-09-17 04:17:55+00:00",
6
+ "num_params_billion": 32.763876352,
7
+ "language": "en"
8
+ }
e3c_llm_requests/Qwen/Qwen3-30B-A3B-Instruct-2507.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen3-30B-A3B-Instruct-2507",
3
+ "base_model": "Qwen3MoeForCausalLM",
4
+ "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad",
5
+ "submitted_time": "2025-07-28 07:31:27+00:00",
6
+ "num_params_billion": 30.532122624,
7
+ "language": ""
8
+ }
e3c_llm_requests/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B-checkpoint.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
3
+ "base_model": "Qwen2ForCausalLM",
4
+ "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746",
5
+ "submitted_time": "2025-01-20 09:19:00+00:00",
6
+ "num_params_billion": 32.763876352,
7
+ "language": ""
8
+ }
e3c_llm_requests/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
3
+ "base_model": "Qwen2ForCausalLM",
4
+ "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746",
5
+ "submitted_time": "2025-01-20 09:19:00+00:00",
6
+ "num_params_billion": 32.763876352,
7
+ "language": ""
8
+ }
e3c_llm_requests/epfl-llm/meditron-7b.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "epfl-llm/meditron-7b",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76",
5
+ "submitted_time": "2023-11-08 16:03:23+00:00",
6
+ "num_params_billion": 6.73855488,
7
+ "language": "en"
8
+ }
e3c_llm_requests/google/gemma-2-9b-it.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "google/gemma-2-9b-it",
3
+ "base_model": "Gemma2ForCausalLM",
4
+ "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819",
5
+ "submitted_time": "2024-06-24 08:05:41+00:00",
6
+ "num_params_billion": 9.241705984,
7
+ "language": ""
8
+ }
e3c_llm_requests/google/gemma-3-27b-it.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "google/gemma-3-27b-it",
3
+ "base_model": "Gemma3ForConditionalGeneration",
4
+ "revision": "005ad3404e59d6023443cb575daa05336842228a",
5
+ "submitted_time": "2025-03-01 19:10:19+00:00",
6
+ "num_params_billion": 27.43240664,
7
+ "language": ""
8
+ }
e3c_llm_requests/google/medgemma-27b-text-it.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "google/medgemma-27b-text-it",
3
+ "base_model": "Gemma3ForCausalLM",
4
+ "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d",
5
+ "submitted_time": "2025-05-19 20:53:04+00:00",
6
+ "num_params_billion": 27.00900224,
7
+ "language": ""
8
+ }
e3c_llm_requests/google/medgemma-4b-it.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "google/medgemma-4b-it",
3
+ "base_model": "Gemma3ForConditionalGeneration",
4
+ "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d",
5
+ "submitted_time": "2025-05-19 20:52:44+00:00",
6
+ "num_params_billion": 4.300079472,
7
+ "language": ""
8
+ }
e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "9213176726f574b556790deb65791e0c5aa438b6",
5
+ "submitted_time": "2024-09-18 15:12:47+00:00",
6
+ "num_params_billion": 1.2358144,
7
+ "language": "en_de_fr_it_pt_hi_es_th"
8
+ }
e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "9213176726f574b556790deb65791e0c5aa438b6",
5
+ "submitted_time": "2024-09-18 15:12:47+00:00",
6
+ "num_params_billion": 1.2358144,
7
+ "language": "en_de_fr_it_pt_hi_es_th"
8
+ }
e3c_llm_requests/microsoft/MediPhi-Clinical.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "microsoft/MediPhi-Clinical",
3
+ "base_model": "Phi3ForCausalLM",
4
+ "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0",
5
+ "submitted_time": "2025-05-29 20:40:05+00:00",
6
+ "num_params_billion": 3.821079552,
7
+ "language": "en"
8
+ }
e3c_llm_requests/microsoft/MediPhi-Instruct.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "microsoft/MediPhi-Instruct",
3
+ "base_model": "Phi3ForCausalLM",
4
+ "revision": "a94ac478e7c246103d55665a0804684042f3b973",
5
+ "submitted_time": "2025-07-11 19:28:15+00:00",
6
+ "num_params_billion": 3.821079552,
7
+ "language": "en"
8
+ }
e3c_llm_requests/mistralai/Mistral-7B-Instruct-v0.2.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "mistralai/Mistral-7B-Instruct-v0.2",
3
+ "base_model": "MistralForCausalLM",
4
+ "revision": "63a8b081895390a26e140280378bc85ec8bce07a",
5
+ "submitted_time": "2023-12-11 13:18:44+00:00",
6
+ "num_params_billion": 7.241732096,
7
+ "language": ""
8
+ }
e3c_llm_requests/mistralai/Mistral-Nemo-Instruct-2407.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "mistralai/Mistral-Nemo-Instruct-2407",
3
+ "base_model": "MistralForCausalLM",
4
+ "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3",
5
+ "submitted_time": "2024-07-17 17:26:49+00:00",
6
+ "num_params_billion": 12.2477824,
7
+ "language": "en_fr_de_es_it_pt_ru_zh_ja"
8
+ }
e3c_llm_requests/tiiuae/Falcon3-10B-Instruct.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "tiiuae/Falcon3-10B-Instruct",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "8799bc6aec0152757221dc6b272d824642db6202",
5
+ "submitted_time": "2024-12-14 05:17:25+00:00",
6
+ "num_params_billion": 10.30565376,
7
+ "language": ""
8
+ }
e3c_llm_requests/unsloth/phi-4.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "unsloth/phi-4",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "c6220bde10fff762dbd72c3331894aa4cade249d",
5
+ "submitted_time": "2025-01-08 21:56:16+00:00",
6
+ "num_params_billion": 14.6595072,
7
+ "language": "en"
8
+ }
e3c_llm_results/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_EN.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 20.954842,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "EN",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 6.29,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 10.41,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 10.83,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 9.176666666666668,
38
+ "best_prompt": 10.83,
39
+ "prompt_id": "p3",
40
+ "CPS": 10.650944
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 12.870000000000001,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 33.94,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 31.31,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 26.040000000000003,
64
+ "best_prompt": 33.94,
65
+ "prompt_id": "p2",
66
+ "CPS": 31.25874
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_GR.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 8.314364166666667,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "GR",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 6.2,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 5.92,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 6.2,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 6.1066666666666665,
38
+ "best_prompt": 6.2,
39
+ "prompt_id": "p1",
40
+ "CPS": 6.194213333333334
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 10.17,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 5.06,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 10.65,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 8.626666666666667,
64
+ "best_prompt": 10.65,
65
+ "prompt_id": "p3",
66
+ "CPS": 10.434515000000001
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_IT.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 12.534040833333332,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "IT",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 4.35,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 4.29,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 3.84,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 4.16,
38
+ "best_prompt": 4.35,
39
+ "prompt_id": "p1",
40
+ "CPS": 4.341735
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 6.72,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 22.66,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 13.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 14.126666666666667,
64
+ "best_prompt": 22.66,
65
+ "prompt_id": "p2",
66
+ "CPS": 20.726346666666664
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_PL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 8.100043833333334,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "PL",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 3.7900000000000005,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 3.7800000000000002,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 3.7900000000000005,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 3.786666666666667,
38
+ "best_prompt": 3.7900000000000005,
39
+ "prompt_id": "p1",
40
+ "CPS": 3.7898736666666673
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 6.02,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 12.93,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 7.779999999999999,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 8.909999999999998,
64
+ "best_prompt": 12.93,
65
+ "prompt_id": "p2",
66
+ "CPS": 12.410214
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SK.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 3.3197085,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "SK",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 3.8699999999999997,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 3.8,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 3.8699999999999997,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 3.8466666666666662,
38
+ "best_prompt": 3.8699999999999997,
39
+ "prompt_id": "p1",
40
+ "CPS": 3.8690969999999996
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 1.21,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 2.8000000000000003,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 1.21,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 1.7400000000000002,
64
+ "best_prompt": 2.8000000000000003,
65
+ "prompt_id": "p2",
66
+ "CPS": 2.77032
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 11.184996000000002,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "SL",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 4.29,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 4.5600000000000005,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 4.29,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 4.38,
38
+ "best_prompt": 4.5600000000000005,
39
+ "prompt_id": "p2",
40
+ "CPS": 4.551792000000001
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 9.67,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 19.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 9.67,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 12.780000000000001,
64
+ "best_prompt": 19.0,
65
+ "prompt_id": "p2",
66
+ "CPS": 17.8182
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_EN.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 21.475744333333335,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "EN",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 21.89,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 22.43,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 19.939999999999998,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 21.419999999999998,
38
+ "best_prompt": 22.43,
39
+ "prompt_id": "p2",
40
+ "CPS": 22.203457
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 11.89,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 16.68,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 21.85,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 16.80666666666667,
64
+ "best_prompt": 21.85,
65
+ "prompt_id": "p3",
66
+ "CPS": 20.74803166666667
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_GR.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 13.395712833333334,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "GR",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 16.669999999999998,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 10.89,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 16.669999999999998,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 14.743333333333332,
38
+ "best_prompt": 16.669999999999998,
39
+ "prompt_id": "p1",
40
+ "CPS": 16.348824666666665
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 8.21,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 10.530000000000001,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 10.36,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 9.700000000000001,
64
+ "best_prompt": 10.530000000000001,
65
+ "prompt_id": "p2",
66
+ "CPS": 10.442601000000002
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_IT.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 28.10758633333333,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "IT",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 32.99,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 40.23,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 39.379999999999995,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 37.53333333333333,
38
+ "best_prompt": 40.23,
39
+ "prompt_id": "p2",
40
+ "CPS": 39.145131
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 9.77,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 12.26,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 17.89,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 13.306666666666667,
64
+ "best_prompt": 17.89,
65
+ "prompt_id": "p3",
66
+ "CPS": 17.070041666666665
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_PL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 25.157004666666666,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "PL",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 39.92,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 39.160000000000004,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 39.92,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 39.666666666666664,
38
+ "best_prompt": 39.92,
39
+ "prompt_id": "p1",
40
+ "CPS": 39.81886933333333
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 9.98,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 10.549999999999999,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 9.56,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 10.03,
64
+ "best_prompt": 10.549999999999999,
65
+ "prompt_id": "p2",
66
+ "CPS": 10.49514
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SK.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 23.0736205,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "SK",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 34.44,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 36.32,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 34.44,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 35.06666666666666,
38
+ "best_prompt": 36.32,
39
+ "prompt_id": "p2",
40
+ "CPS": 35.864789333333334
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 7.340000000000001,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 10.45,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 8.75,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 8.846666666666666,
64
+ "best_prompt": 10.45,
65
+ "prompt_id": "p2",
66
+ "CPS": 10.282451666666665
67
+ }
68
+ }
69
+ }
e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 23.493655333333336,
3
+ "config": {
4
+ "model_name": "Henrychur/MMed-Llama-3-8B",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "SL",
8
+ "model": "Henrychur/MMed-Llama-3-8B",
9
+ "base_model": "LlamaForCausalLM",
10
+ "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943",
11
+ "submitted_time": "2024-05-22 09:17:24+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_zh_ja_fr_ru_es"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 35.58,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 40.45,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 35.58,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 37.20333333333333,
38
+ "best_prompt": 40.45,
39
+ "prompt_id": "p2",
40
+ "CPS": 39.136723333333336
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 7.870000000000001,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 7.8100000000000005,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 7.19,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 7.623333333333334,
64
+ "best_prompt": 7.870000000000001,
65
+ "prompt_id": "p1",
66
+ "CPS": 7.850587333333335
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_EN.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 4.530016666666667,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "EN",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 9.4,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 3.3099999999999996,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 4.64,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 5.783333333333334,
38
+ "best_prompt": 9.4,
39
+ "prompt_id": "p1",
40
+ "CPS": 9.060033333333333
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.0,
64
+ "best_prompt": 0.0,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.0
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_GR.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 4.256631333333333,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "GR",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 8.59,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 5.91,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 8.59,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 7.696666666666666,
38
+ "best_prompt": 8.59,
39
+ "prompt_id": "p1",
40
+ "CPS": 8.513262666666666
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.0,
64
+ "best_prompt": 0.0,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.0
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_IT.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 4.550473333333333,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "IT",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 7.7,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 9.2,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 7.470000000000001,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 8.123333333333333,
38
+ "best_prompt": 9.2,
39
+ "prompt_id": "p2",
40
+ "CPS": 9.100946666666665
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p2",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p3",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ }
56
+ ],
57
+ "average_accuracy": 0.0,
58
+ "best_prompt": 0.0,
59
+ "prompt_id": "p2",
60
+ "CPS": 0.0
61
+ }
62
+ }
63
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_PL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 2.1520960000000002,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "PL",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 2.44,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 4.36,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 2.44,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 3.08,
38
+ "best_prompt": 4.36,
39
+ "prompt_id": "p2",
40
+ "CPS": 4.3041920000000005
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.0,
64
+ "best_prompt": 0.0,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.0
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_SK.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 4.3259333333333325,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "SK",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 8.799999999999999,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 3.75,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 8.799999999999999,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 7.116666666666666,
38
+ "best_prompt": 8.799999999999999,
39
+ "prompt_id": "p1",
40
+ "CPS": 8.651866666666665
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.0,
64
+ "best_prompt": 0.0,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.0
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_0_SL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 3.859359,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "SL",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 7.7700000000000005,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 5.79,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 7.7700000000000005,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 7.11,
38
+ "best_prompt": 7.7700000000000005,
39
+ "prompt_id": "p1",
40
+ "CPS": 7.718718
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.0,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.0,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.0,
64
+ "best_prompt": 0.0,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.0
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_EN.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 7.250459833333332,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "EN",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 12.15,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 14.149999999999999,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 13.22,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 13.173333333333332,
38
+ "best_prompt": 14.149999999999999,
39
+ "prompt_id": "p2",
40
+ "CPS": 14.011801666666665
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.27999999999999997,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.16,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.49,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.31,
64
+ "best_prompt": 0.49,
65
+ "prompt_id": "p3",
66
+ "CPS": 0.489118
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_GR.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 7.3897435,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "GR",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 14.549999999999999,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 14.34,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 14.549999999999999,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 14.479999999999999,
38
+ "best_prompt": 14.549999999999999,
39
+ "prompt_id": "p1",
40
+ "CPS": 14.539814999999999
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.24,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.06999999999999999,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.0,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.10333333333333333,
64
+ "best_prompt": 0.24,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.239672
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_IT.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 9.117947333333333,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "IT",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 16.16,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 17.740000000000002,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 16.900000000000002,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 16.933333333333334,
38
+ "best_prompt": 17.740000000000002,
39
+ "prompt_id": "p2",
40
+ "CPS": 17.596897333333335
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.35000000000000003,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.64,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.45999999999999996,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.48333333333333334,
64
+ "best_prompt": 0.64,
65
+ "prompt_id": "p2",
66
+ "CPS": 0.6389973333333333
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_PL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 7.915078666666666,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "PL",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 15.0,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 15.479999999999999,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 15.0,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 15.159999999999998,
38
+ "best_prompt": 15.479999999999999,
39
+ "prompt_id": "p2",
40
+ "CPS": 15.430463999999999
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.4,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.22999999999999998,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.33999999999999997,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.3233333333333333,
64
+ "best_prompt": 0.4,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.39969333333333334
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_SK.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 7.5838598333333325,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "SK",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 14.85,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 13.600000000000001,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 14.85,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 14.433333333333332,
38
+ "best_prompt": 14.85,
39
+ "prompt_id": "p1",
40
+ "CPS": 14.788124999999999
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.38,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.24,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.2,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.2733333333333334,
64
+ "best_prompt": 0.38,
65
+ "prompt_id": "p1",
66
+ "CPS": 0.3795946666666667
67
+ }
68
+ }
69
+ }
e3c_llm_results/HiTZ/Medical-mT5-large_10_SL.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 7.7788705,
3
+ "config": {
4
+ "model_name": "HiTZ/Medical-mT5-large",
5
+ "num_fewshot": "10",
6
+ "batch_size": 1,
7
+ "LANG": "SL",
8
+ "model": "HiTZ/Medical-mT5-large",
9
+ "base_model": "MT5ForConditionalGeneration",
10
+ "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47",
11
+ "submitted_time": "2023-10-31 15:15:15+00:00",
12
+ "num_params_billion": null,
13
+ "language": "en_es_fr_it"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 14.7,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 13.25,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 14.7,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 14.216666666666667,
38
+ "best_prompt": 14.7,
39
+ "prompt_id": "p1",
40
+ "CPS": 14.62895
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 0.73,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 0.74,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 0.9299999999999999,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 0.7999999999999999,
64
+ "best_prompt": 0.9299999999999999,
65
+ "prompt_id": "p3",
66
+ "CPS": 0.9287909999999999
67
+ }
68
+ }
69
+ }
e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_EN.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 36.19732933333333,
3
+ "config": {
4
+ "model_name": "Qwen/Qwen2.5-14B-Instruct-1M",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "EN",
8
+ "model": "Qwen/Qwen2.5-14B-Instruct-1M",
9
+ "base_model": "Qwen2ForCausalLM",
10
+ "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438",
11
+ "submitted_time": "2025-01-23 13:23:24+00:00",
12
+ "num_params_billion": 14.770033664,
13
+ "language": "en"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 34.25,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 11.81,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 28.93,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 24.996666666666666,
38
+ "best_prompt": 34.25,
39
+ "prompt_id": "p1",
40
+ "CPS": 31.08073333333333
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 41.349999999999994,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 39.17,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 41.72,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 40.74666666666666,
64
+ "best_prompt": 41.72,
65
+ "prompt_id": "p3",
66
+ "CPS": 41.31392533333333
67
+ }
68
+ }
69
+ }
e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_GR.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 27.333585333333332,
3
+ "config": {
4
+ "model_name": "Qwen/Qwen2.5-14B-Instruct-1M",
5
+ "num_fewshot": "0",
6
+ "batch_size": 1,
7
+ "LANG": "GR",
8
+ "model": "Qwen/Qwen2.5-14B-Instruct-1M",
9
+ "base_model": "Qwen2ForCausalLM",
10
+ "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438",
11
+ "submitted_time": "2025-01-23 13:23:24+00:00",
12
+ "num_params_billion": 14.770033664,
13
+ "language": "en"
14
+ },
15
+ "tasks": {
16
+ "NER": {
17
+ "prompts": [
18
+ {
19
+ "prompt": "p1",
20
+ "metric": "f1",
21
+ "value": 13.389999999999999,
22
+ "stderr": 0.0
23
+ },
24
+ {
25
+ "prompt": "p2",
26
+ "metric": "f1",
27
+ "value": 11.91,
28
+ "stderr": 0.0
29
+ },
30
+ {
31
+ "prompt": "p3",
32
+ "metric": "f1",
33
+ "value": 13.389999999999999,
34
+ "stderr": 0.0
35
+ }
36
+ ],
37
+ "average_accuracy": 12.896666666666667,
38
+ "best_prompt": 13.389999999999999,
39
+ "prompt_id": "p1",
40
+ "CPS": 13.323942666666666
41
+ },
42
+ "RE": {
43
+ "prompts": [
44
+ {
45
+ "prompt": "p1",
46
+ "metric": "f1",
47
+ "value": 37.96,
48
+ "stderr": 0.0
49
+ },
50
+ {
51
+ "prompt": "p2",
52
+ "metric": "f1",
53
+ "value": 42.66,
54
+ "stderr": 0.0
55
+ },
56
+ {
57
+ "prompt": "p3",
58
+ "metric": "f1",
59
+ "value": 38.1,
60
+ "stderr": 0.0
61
+ }
62
+ ],
63
+ "average_accuracy": 39.57333333333333,
64
+ "best_prompt": 42.66,
65
+ "prompt_id": "p2",
66
+ "CPS": 41.343227999999996
67
+ }
68
+ }
69
+ }