Sfarzi committed on
Commit
f7a50a0
·
1 Parent(s): 02fbbb9

Initial clone with modifications

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .ipynb_checkpoints/preprocess_models_output-checkpoint.py +264 -0
  2. app.py +37 -49
  3. csv_new/llm_scores_p1_final.xlsx +0 -0
  4. csv_new/llm_scores_p2_final.xlsx +0 -0
  5. csv_new/llm_scores_p3_final.xlsx +0 -0
  6. csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt +23 -0
  7. csv_new/output/.ipynb_checkpoints/epfl-llm__meditron-7b__gr__0shot-checkpoint.txt +11 -0
  8. csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt +23 -0
  9. csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt +23 -0
  10. csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt +11 -0
  11. csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt +11 -0
  12. csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt +23 -0
  13. csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt +23 -0
  14. csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt +11 -0
  15. csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt +11 -0
  16. csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt +11 -0
  17. csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt +11 -0
  18. csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt +11 -0
  19. csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt +11 -0
  20. csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt +23 -0
  21. csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt +23 -0
  22. csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt +11 -0
  23. csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt +11 -0
  24. csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt +22 -0
  25. csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt +23 -0
  26. csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt +11 -0
  27. csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt +11 -0
  28. csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt +11 -0
  29. csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt +11 -0
  30. csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt +11 -0
  31. csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt +11 -0
  32. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt +23 -0
  33. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt +23 -0
  34. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt +11 -0
  35. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt +11 -0
  36. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt +23 -0
  37. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt +23 -0
  38. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt +11 -0
  39. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt +11 -0
  40. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt +11 -0
  41. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt +11 -0
  42. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt +11 -0
  43. csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt +11 -0
  44. csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt +25 -0
  45. csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt +24 -0
  46. csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt +11 -0
  47. csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt +11 -0
  48. csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt +24 -0
  49. csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt +24 -0
  50. csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt +11 -0
.ipynb_checkpoints/preprocess_models_output-checkpoint.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EVALITA LLM EVALUATION PROCESSOR
3
+
4
+ Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
5
+
6
+ DATA PIPELINE OVERVIEW:
7
+
8
+ 1. Inputs:
9
+ - Evaluation Results: Raw .out files from lm-eval-harness
10
+ - Model Metadata: Pre-collected .json files from HuggingFace
11
+
12
+ 2. Output:
13
+ - Comprehensive evaluation reports in JSON format
14
+ - Ready for ingestion into the evaluation leaderboard
15
+
16
+ --------------------------------------------------------------------
17
+ INPUT SPECIFICATION
18
+
19
+ Evaluation Results (.out format):
20
+ hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
21
+ | Task | Metric | Value | Stderr |
22
+ |---------------|--------|--------|--------|
23
+ | main-task | acc | 0.5605 | 0.0052 |
24
+ | - sub-task | acc | 0.4640 | 0.0088 |
25
+ | - prompt-1 | acc | 0.3720 | 0.0216 |
26
+
27
+ Model Metadata (.json format):
28
+ {
29
+ "model": "model-org/model-name",
30
+ "base_model": "ModelArchitecture",
31
+ "revision": "git_commit_hash",
32
+ "parameters": 8.03,
33
+ "language": "en_it"
34
+ }
35
+
36
+ --------------------------------------------------------------------
37
+ OUTPUT SPECIFICATION
38
+
39
+ Evaluation Report (.json format):
40
+ {
41
+ "summary_metrics": {
42
+ "average_CPS": 41.74,
43
+ "num_tasks": 12
44
+ },
45
+ "model_config": {
46
+ "identifier": "model-org/model-name",
47
+ "architecture": "ModelArchitecture",
48
+ "parameters": 8.03,
49
+ "evaluation_settings": {
50
+ "fewshot": 5,
51
+ "batch_size": 1
52
+ }
53
+ },
54
+ "task_results": {
55
+ "task-name": {
56
+ "average_score": 52.60,
57
+ "best_prompt": {
58
+ "id": "prompt-6",
59
+ "score": 66.57
60
+ },
61
+ "prompt_analysis": [
62
+ {
63
+ "prompt_id": "prompt-1",
64
+ "score": 37.20,
65
+ "stderr": 0.0216
66
+ }
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ """
72
+
73
+ import json
74
+ import os
75
+ import re
76
+ import statistics
77
+
78
+
79
def safe_float(value):
    """Convert ``value`` to ``float``, returning ``None`` when that is impossible.

    Args:
        value: A number, a numeric string (surrounding whitespace is accepted
            by ``float``), or any other object.

    Returns:
        float | None: The parsed number, or ``None`` if ``value`` cannot be
        converted.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text, e.g. a table separator cell ("------").
        # TypeError: an input float() does not accept at all, e.g. None or a list.
        return None
85
+
86
+
87
def calculate_task_metrics(task_info):
    """Fill in the aggregate score fields of one task record, in place.

    Args:
        task_info (dict): Task record whose ``'prompts'`` entry is a list of
            dicts, each carrying a ``'prompt'`` identifier and a ``'value'``
            score (``None`` when no score is available for that prompt).

    Returns:
        None. When at least one prompt has a score, the keys
        ``'average_accuracy'``, ``'std_accuracy'``, ``'best_prompt'``,
        ``'prompt_id'`` and ``'CPS'`` are written to ``task_info``. When no
        prompt has a score the record is left untouched.
    """
    # Only prompts that actually carry a score take part in any statistic.
    scored_prompts = [prompt for prompt in task_info['prompts'] if prompt['value'] is not None]
    if not scored_prompts:
        return None

    accuracies = [prompt['value'] for prompt in scored_prompts]
    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
    # statistics.stdev needs at least two data points.
    task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

    # Rank the scored prompts only: comparing None with a float raises TypeError.
    best_prompt_data = max(scored_prompts, key=lambda prompt: prompt['value'])
    task_info['best_prompt'] = best_prompt_data['value']
    task_info['prompt_id'] = best_prompt_data['prompt']

    # CPS: the best score, scaled down by the gap between best and average.
    # The division by 100 presumes scores on a 0-100 scale.
    avg_acc = task_info['average_accuracy']
    best_acc = task_info['best_prompt']
    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
104
+
105
+
106
def extract_data_from_file(file_path):
    """Parse one results-table file into per-task prompt scores and metrics.

    The file is expected to hold an ``hf (pretrained=...)`` header line
    followed by a pipe-delimited table. A table row is used only if splitting
    it on ``'|'`` yields exactly 11 cells; cell 1 is the task/prompt label,
    cell 5 the metric name, cell 7 the value and cell 9 the stderr.

    Label cells of the form ``" - NAME"`` (one leading space) open a new task;
    label cells indented more deeply (``"  - NAME"`` or deeper) are prompts of
    the most recently opened task.
    NOTE(review): the deeper-indent rule for prompts is inferred from the
    3- vs 4-character prefix slices of the previous implementation -- confirm
    against a raw result file.

    Args:
        file_path (str): Path of the results file. The language code is taken
            from a ``__en__``/``__sl__``/``__it__``/``__gr__``/``__sk__``/
            ``__pl__`` token in the path.

    Returns:
        dict: ``{'average_CPS': float, 'config': dict, 'tasks': dict}`` where
        ``config`` holds ``model_name``, ``num_fewshot``, ``batch_size`` and
        ``LANG`` (the first three are ``None`` when the header line is absent),
        and ``tasks`` maps each task name to its prompt list and metrics.
    """
    # Every token is checked; when several occur in the path the one listed
    # last wins.
    LANG = ""
    for token, code in (("__en__", "EN"), ("__sl__", "SL"), ("__it__", "IT"),
                        ("__gr__", "GR"), ("__sk__", "SK"), ("__pl__", "PL")):
        if token in file_path:
            LANG = code
    if LANG == "":
        print("ERROR: ", file_path)

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    tasks_data = {}
    current_task = None
    # Defaults so that a file without a header line yields None fields
    # instead of an unbound-variable error when the config is built.
    pretrained_model = None
    num_fewshot = None
    batch_size = None

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Header line: model identifier and evaluation settings.
        if line.startswith("hf (pretrained=") or line.startswith("hf(pretrained="):
            # The identifier ends at the first whitespace, comma or ")" so both
            # "name ), ..." and "name), ..." / "name,dtype=..." are handled.
            model_match = re.search(r"pretrained=([^\s,)]+)", line)
            pretrained_model = model_match.group(1) if model_match else None

            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
            continue

        columns = line.split('|')
        if len(columns) != 11:
            continue

        label_cell = columns[1]
        metric = columns[5].strip()
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])

        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue

        # Column-header and separator rows have no "- NAME" label and are
        # therefore ignored here.
        label = label_cell.strip()
        if not label.startswith("- "):
            continue
        entry_name = label[2:].strip()
        indent = len(label_cell) - len(label_cell.lstrip(' '))

        if indent == 1:
            # Task row.
            current_task = entry_name
            tasks_data.setdefault(current_task,
                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None,
                                   'prompt_id': None, 'CPS': None})
        elif indent >= 2 and current_task:
            # Prompt row of the current task.
            if value is None:
                # An unparseable score cannot be scaled or ranked; report and skip it.
                print("WARNING: unparseable value for prompt", entry_name, "in", file_path)
                continue
            tasks_data[current_task]['prompts'].append(
                {'prompt': entry_name, 'metric': metric, 'value': value * 100, 'stderr': stderr})

    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}

        weighted_values = {"prompt-1": 0, "prompt-2": 0}
        # NOTE(review): this total covers the weights of BOTH prompts, so each
        # prompt's weighted sum is divided by twice its own weight total --
        # confirm this is intended before changing it.
        total_weights = sum(weight_map.values())

        for prompt in task_info['prompts']:
            if prompt['prompt'] in weight_map:
                if "prompt-1" in prompt['prompt']:
                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
                elif "prompt-2" in prompt['prompt']:
                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']

        task_info['prompts'] = [
            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
             'stderr': None},
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]

    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    # Calculates the average CPS across all tasks that have one
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "LANG": LANG
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
214
+
215
+
216
+ """
217
+ MAIN PROCESSING PIPELINE
218
+
219
+ This script executes the complete evaluation data processing workflow:
220
+
221
+ 1. Input Sources:
222
+ - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
223
+ - Model metadata JSON files from: ../evalita_llm_requests/
224
+
225
+ 2. Processing Steps:
226
+ - Parses evaluation metrics from .out files
227
+ - Combines with model metadata
228
+ - Calculates aggregated performance statistics
229
+
230
+ 3. Output:
231
+ - Structured JSON results saved to: ../evalita_llm_results/
232
+ - Organized by model organization/name
233
+ - Contains complete evaluation results with metadata
234
+ """
235
# Locations of the raw result tables, the per-model metadata files, and the
# folder that receives the generated JSON reports.
directory_in_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/csv_files/outputs/'
directory_in_requests_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/e3c_llm_requests/'
directory_out_results_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/e3c_llm_results/'

for filename in os.listdir(directory_in_path):
    # Only the .txt result tables are processed.
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(directory_in_path, filename)
    json_output = extract_data_from_file(file_path)

    # The model identifier has the form "<organization>/<model>".
    model_org_name, model_name = json_output['config']['model_name'].split('/')

    # Merge the pre-collected model metadata into the config when it exists.
    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
    if os.path.exists(config_file_path):
        with open(config_file_path, 'r', encoding='utf-8') as config_file:
            additional_config = json.load(config_file)
        json_output['config'].update(additional_config)

    # One output folder per organization.
    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
    os.makedirs(org_folder_path, exist_ok=True)

    # Report name: <model>_<num_fewshot>_<LANG>.json
    file_suffix = f"{json_output['config']['num_fewshot']}_{json_output['config']['LANG']}"
    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

    with open(output_file_path, 'w', newline="\n") as outfile:
        json.dump(json_output, outfile, indent=4)

    print(f"File {filename} processed and saved to {output_file_path}")
app.py CHANGED
@@ -23,7 +23,7 @@ import numpy as np
23
  # === NEW: helper for prompt sensitivity (simple: only NER/REL and 3 prompts) ===
24
  def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
25
  """
26
- Computes a simple Prompt Sensitivity Index (PSI) over the tasks (NER, REL)
27
  using the distribution of 'Best Prompt Id' across the provided prompt_ids.
28
  """
29
  cv_per_task = []
@@ -47,14 +47,14 @@ def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
47
 
48
  def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None):
49
  """
50
- Table with best overall model per task (NER, REL) and the model with the best prompt score.
51
  Applies optional filters:
52
  - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
53
  - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
54
  """
55
- tasks = ["NER", "REL"]
56
  df = dataframe.copy()
57
-
58
  if lang and lang != "All" and "LANG" in df.columns:
59
  df = df[df["LANG"] == lang]
60
  if shot and shot != "All" and "IS_FS" in df.columns:
@@ -66,8 +66,13 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
66
  if task not in df.columns or df.empty:
67
  continue
68
  # Best overall on task
69
- max_idx = df[task].idxmax()
70
- model_raw = df.loc[max_idx, 'Model']
 
 
 
 
 
71
  if isinstance(model_raw, str) and '<' in model_raw:
72
  match = re.search(r'>([^<]+)<', model_raw)
73
  model_name = match.group(1) if match else model_raw
@@ -78,8 +83,11 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
78
  # Best prompt row for task
79
  best_prompt_column = f"{task} Best Prompt"
80
  if best_prompt_column in df.columns and df[best_prompt_column].notna().any():
81
- best_prompt_idx = df[best_prompt_column].idxmax()
82
- best_prompt_model_raw = df.loc[best_prompt_idx, 'Model']
 
 
 
83
  if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
84
  match = re.search(r'>([^<]+)<', best_prompt_model_raw)
85
  best_prompt_model = match.group(1) if match else best_prompt_model_raw
@@ -118,7 +126,7 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
118
  subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
119
 
120
  fig.update_layout(
121
- title={'text': f"Top Model per Task: CPS & Best Prompt (NER/REL) — {', '.join(subtitle)}",
122
  'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
123
  font=dict(family="Arial", size=11),
124
  height=420, margin=dict(l=20, r=20, t=50, b=80)
@@ -130,10 +138,10 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
130
  # === NEW: Best-model comparison table (only NER, REL) ===
131
  def create_best_model_comparison_table_without_lang(dataframe):
132
  """
133
- Table with the best overall model per task (NER, REL) and the model that
134
  achieves the best score with its own best prompt.
135
  """
136
- tasks = ["NER", "REL"]
137
  table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
138
 
139
  for task in tasks:
@@ -208,7 +216,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
208
  - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
209
  - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
210
  """
211
- tasks = ["NER", "REL"]
212
 
213
  df = dataframe.copy()
214
  # Language filter
@@ -269,7 +277,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
269
  title_parts.append(lang if (lang and lang != "All") else "All languages")
270
  title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
271
  fig.update_layout(
272
- title={'text': f"Most Effective Prompts (NER/REL) — {', '.join(title_parts)}",
273
  'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
274
  xaxis_title="Task", yaxis_title="Prompt",
275
  font=dict(family="Arial", size=11), margin=dict(b=100),
@@ -286,7 +294,7 @@ def create_prompt_heatmap_without_lang(dataframe):
286
  for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
287
  ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
288
  """
289
- tasks = ["NER", "REL"]
290
 
291
  # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
292
  all_ids = set()
@@ -381,13 +389,13 @@ def mean_of_max_per_field(df):
381
  Calcola il massimo per ciascun campo e poi la media dei massimi.
382
 
383
  Args:
384
- df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL
385
 
386
  Returns:
387
  float: media dei valori massimi dei campi
388
  """
389
  #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
390
- fields = ["NER", "REL"]
391
  #print(df.columns)
392
 
393
  # Controlla che tutte le colonne esistano nel DataFrame
@@ -396,7 +404,7 @@ def mean_of_max_per_field(df):
396
  raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
397
 
398
  # Calcola il massimo per ciascun campo
399
- max_values = df[fields].max()
400
 
401
  # Calcola la media dei massimi
402
  mean_max = max_values.mean()
@@ -406,7 +414,7 @@ def mean_of_max_per_field(df):
406
 
407
  def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
408
  if tasks is None:
409
- tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
410
 
411
  task_means = {}
412
 
@@ -481,7 +489,7 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
481
  #print(dataframe.columns)
482
 
483
  #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
484
- tasks =["NER", "REL"]
485
  if dataframe is None:
486
  np.random.seed(42)
487
  dataframe = pd.DataFrame({
@@ -799,10 +807,12 @@ TASK_METADATA_MULTIPLECHOICE = {
799
 
800
  # Define task metadata (icons, names, descriptions)
801
  TASK_METADATA_GENERATIVE = {
802
- #"LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
803
- #"SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
804
  "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
805
  "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
 
 
 
806
  }
807
 
808
  def restart_space():
@@ -895,8 +905,10 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
895
  """
896
  if dataframe is None or dataframe.empty:
897
  raise ValueError("Leaderboard DataFrame is empty or None.")
898
-
899
- sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
 
 
900
 
901
  # aggiungo la colonna rank in base alla posizione
902
  sorted_dataframe = sorted_dataframe.reset_index(drop=True)
@@ -972,31 +984,7 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
972
  interactive=False
973
  )
974
 
975
- '''
976
- # Helper function for leaderboard initialization
977
- def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
978
- """Initialize and return a leaderboard."""
979
- if dataframe is None or dataframe.empty:
980
- raise ValueError("Leaderboard DataFrame is empty or None.")
981
 
982
- return Leaderboard(
983
- value=dataframe,
984
- datatype=[c.type for c in fields(AutoEvalColumn)],
985
- select_columns=SelectColumns(
986
- default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
987
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
988
- label="Select Columns to Display:",
989
- ),
990
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
991
- hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
992
- filter_columns=[
993
- ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
994
- ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
995
- ],
996
- bool_checkboxgroup_label="Hide models",
997
- interactive=False,
998
- )
999
- '''
1000
 
1001
  def download_snapshot(repo, local_dir):
1002
  """Try to download a snapshot from Hugging Face Hub."""
@@ -1087,8 +1075,8 @@ with demo:
1087
 
1088
  leaderboard = init_leaderboard(
1089
  LEADERBOARD_DF,
1090
- default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
1091
- hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
1092
  )
1093
 
1094
 
 
23
  # === NEW: helper for prompt sensitivity (simple: only NER/REL and 3 prompts) ===
24
  def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
25
  """
26
+ Computes a simple Prompt Sensitivity Index (PSI) over the tasks
27
  using the distribution of 'Best Prompt Id' across the provided prompt_ids.
28
  """
29
  cv_per_task = []
 
47
 
48
  def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None):
49
  """
50
+ Table with best overall model per task and the model with the best prompt score.
51
  Applies optional filters:
52
  - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
53
  - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
54
  """
55
+ tasks = ["NER", "REL", "RML", "HIS", "DIA"]
56
  df = dataframe.copy()
57
+
58
  if lang and lang != "All" and "LANG" in df.columns:
59
  df = df[df["LANG"] == lang]
60
  if shot and shot != "All" and "IS_FS" in df.columns:
 
66
  if task not in df.columns or df.empty:
67
  continue
68
  # Best overall on task
69
+ #max_idx = df[task].idxmax()
70
+ max_idx = pd.to_numeric(df[task], errors='coerce').idxmax()
71
+ try:
72
+ model_raw = df.loc[max_idx, 'Model']
73
+ except Exception as e:
74
+ break
75
+
76
  if isinstance(model_raw, str) and '<' in model_raw:
77
  match = re.search(r'>([^<]+)<', model_raw)
78
  model_name = match.group(1) if match else model_raw
 
83
  # Best prompt row for task
84
  best_prompt_column = f"{task} Best Prompt"
85
  if best_prompt_column in df.columns and df[best_prompt_column].notna().any():
86
+ best_prompt_idx= pd.to_numeric(df[best_prompt_column],errors='coerce').idxmax()
87
+ try:
88
+ best_prompt_model_raw = df.loc[best_prompt_idx, 'Model']
89
+ except Exception as e:
90
+ break
91
  if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
92
  match = re.search(r'>([^<]+)<', best_prompt_model_raw)
93
  best_prompt_model = match.group(1) if match else best_prompt_model_raw
 
126
  subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
127
 
128
  fig.update_layout(
129
+ title={'text': f"Top Model per Task: CPS & Best Prompt — {', '.join(subtitle)}",
130
  'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
131
  font=dict(family="Arial", size=11),
132
  height=420, margin=dict(l=20, r=20, t=50, b=80)
 
138
  # === NEW: Best-model comparison table (only NER, REL) ===
139
  def create_best_model_comparison_table_without_lang(dataframe):
140
  """
141
+ Table with the best overall model per task (NER, REL,) and the model that
142
  achieves the best score with its own best prompt.
143
  """
144
+ tasks = ["NER", "REL", "RML", "HIS", "DIA"]
145
  table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
146
 
147
  for task in tasks:
 
216
  - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
217
  - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
218
  """
219
+ tasks = ["NER", "REL", "RML", "HIS", "DIA"]
220
 
221
  df = dataframe.copy()
222
  # Language filter
 
277
  title_parts.append(lang if (lang and lang != "All") else "All languages")
278
  title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
279
  fig.update_layout(
280
+ title={'text': f"Most Effective Prompts — {', '.join(title_parts)}",
281
  'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
282
  xaxis_title="Task", yaxis_title="Prompt",
283
  font=dict(family="Arial", size=11), margin=dict(b=100),
 
294
  for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
295
  ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
296
  """
297
+ tasks = ["NER", "REL", "RML", "HIS", "DIA"]
298
 
299
  # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
300
  all_ids = set()
 
389
  Calcola il massimo per ciascun campo e poi la media dei massimi.
390
 
391
  Args:
392
+ df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL, RML, DIA, HIS
393
 
394
  Returns:
395
  float: media dei valori massimi dei campi
396
  """
397
  #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
398
+ fields = ["NER", "REL", "RML", "DIA", "HIS"]
399
  #print(df.columns)
400
 
401
  # Controlla che tutte le colonne esistano nel DataFrame
 
404
  raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
405
 
406
  # Calcola il massimo per ciascun campo
407
+ max_values = df[fields].apply(pd.to_numeric, errors='coerce').max(skipna=True)
408
 
409
  # Calcola la media dei massimi
410
  mean_max = max_values.mean()
 
414
 
415
  def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
416
  if tasks is None:
417
+ tasks = [ "NER", "REL", "RML", "DIA", "HIS"]
418
 
419
  task_means = {}
420
 
 
489
  #print(dataframe.columns)
490
 
491
  #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
492
+ tasks =["NER", "REL", "RML", "HIS", "DIA"]
493
  if dataframe is None:
494
  np.random.seed(42)
495
  dataframe = pd.DataFrame({
 
807
 
808
  # Define task metadata (icons, names, descriptions)
809
  TASK_METADATA_GENERATIVE = {
810
+
 
811
  "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
812
  "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
813
+ "RML": {"icon": "😃", "name": "CRF RML", "tooltip": "CRF RML"},
814
+ "DIA": {"icon": "🏥", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
815
+ "HIS": {"icon": "📝", "name": "CRF History", "tooltip": "CRF History"},
816
  }
817
 
818
  def restart_space():
 
905
  """
906
  if dataframe is None or dataframe.empty:
907
  raise ValueError("Leaderboard DataFrame is empty or None.")
908
+ #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
909
+ clean_df = dataframe.assign( **{"Combined Performance": pd.to_numeric(dataframe["Combined Performance"], errors="coerce")}).loc[lambda df: df["Combined Performance"].notna() & (df["Combined Performance"] != 0)]
910
+
911
+ sorted_dataframe = clean_df.sort_values(by="Combined Performance", ascending=False)
912
 
913
  # aggiungo la colonna rank in base alla posizione
914
  sorted_dataframe = sorted_dataframe.reset_index(drop=True)
 
984
  interactive=False
985
  )
986
 
 
 
 
 
 
 
987
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
988
 
989
  def download_snapshot(repo, local_dir):
990
  """Try to download a snapshot from Hugging Face Hub."""
 
1075
 
1076
  leaderboard = init_leaderboard(
1077
  LEADERBOARD_DF,
1078
+ default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"],
1079
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"]]
1080
  )
1081
 
1082
 
csv_new/llm_scores_p1_final.xlsx ADDED
Binary file (32.1 kB). View file
 
csv_new/llm_scores_p2_final.xlsx ADDED
Binary file (26.9 kB). View file
 
csv_new/llm_scores_p3_final.xlsx ADDED
Binary file (27.8 kB). View file
 
csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2877 | |0 |
5
+ | - p1 | | | |f1 | | 0.1963 | | 0 |
6
+ | - p2 | | | |f1 | | 0.3459 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3208 | | 0 |
8
+ | - RE | | | |f1 | | 0.4430 | |0 |
9
+ | - p1 | | | |f1 | | 0.4487 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4492 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4311 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/.ipynb_checkpoints/epfl-llm__meditron-7b__gr__0shot-checkpoint.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2426 | |0 |
5
+ | - p1 | | | |f1 | | 0.2417 | | 0 |
6
+ | - p2 | | | |f1 | | 0.2443 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2417 | | 0 |
8
+ | - RE | | | |f1 | | 0.0592 | |0 |
9
+ | - p1 | | | |f1 | | 0.1556 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0161 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0058 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0918 | |0 |
5
+ | - p1 | | | |f1 | | 0.0629 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1041 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1083 | | 0 |
8
+ | - RE | | | |f1 | | 0.2604 | |0 |
9
+ | - p1 | | | |f1 | | 0.1287 | | 0 |
10
+ | - p2 | | | |f1 | | 0.3394 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3131 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2142 | |0 |
5
+ | - p1 | | | |f1 | | 0.2189 | | 0 |
6
+ | - p2 | | | |f1 | | 0.2243 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1994 | | 0 |
8
+ | - RE | | | |f1 | | 0.1681 | |0 |
9
+ | - p1 | | | |f1 | | 0.1189 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1668 | | 0 |
11
+ | - p3 | | | |f1 | | 0.2185 | | 0 |
12
+ | - RML | | | |f1 | | 0.1779 | |0 |
13
+ | - p1 | | | |f1 | | 0.1825 | | 0 |
14
+ | - p2 | | | |f1 | | 0.1612 | | 0 |
15
+ | - p3 | | | |f1 | | 0.1900 | | 0 |
16
+ | - DIA | | | |f1 | | 0.1500 | |0 |
17
+ | - p1 | | | |f1 | | 0.2415 | | 0 |
18
+ | - p2 | | | |f1 | | 0.1416 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0668 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0147 | |0 |
21
+ | - p1 | | | |f1 | | 0.0178 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0068 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0194 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0611 | |0 |
5
+ | - p1 | | | |f1 | | 0.0620 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0592 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0620 | | 0 |
8
+ | - RE | | | |f1 | | 0.0863 | |0 |
9
+ | - p1 | | | |f1 | | 0.1017 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0506 | | 0 |
11
+ | - p3 | | | |f1 | | 0.1065 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1474 | |0 |
5
+ | - p1 | | | |f1 | | 0.1667 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1089 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1667 | | 0 |
8
+ | - RE | | | |f1 | | 0.0970 | |0 |
9
+ | - p1 | | | |f1 | | 0.0821 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1053 | | 0 |
11
+ | - p3 | | | |f1 | | 0.1036 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0416 | |0 |
5
+ | - p1 | | | |f1 | | 0.0435 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0429 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0384 | | 0 |
8
+ | - RE | | | |f1 | | 0.1413 | |0 |
9
+ | - p1 | | | |f1 | | 0.0672 | | 0 |
10
+ | - p2 | | | |f1 | | 0.2266 | | 0 |
11
+ | - p3 | | | |f1 | | 0.1300 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3753 | |0 |
5
+ | - p1 | | | |f1 | | 0.3299 | | 0 |
6
+ | - p2 | | | |f1 | | 0.4023 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3938 | | 0 |
8
+ | - RE | | | |f1 | | 0.1331 | |0 |
9
+ | - p1 | | | |f1 | | 0.0977 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1226 | | 0 |
11
+ | - p3 | | | |f1 | | 0.1789 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.1044 | |0 |
17
+ | - p1 | | | |f1 | | 0.0821 | | 0 |
18
+ | - p2 | | | |f1 | | 0.1119 | | 0 |
19
+ | - p3 | | | |f1 | | 0.1190 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0007 | |0 |
21
+ | - p1 | | | |f1 | | 0.0010 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0002 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0008 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0379 | |0 |
5
+ | - p1 | | | |f1 | | 0.0379 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0378 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0379 | | 0 |
8
+ | - RE | | | |f1 | | 0.0891 | |0 |
9
+ | - p1 | | | |f1 | | 0.0602 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1293 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0778 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3966 | |0 |
5
+ | - p1 | | | |f1 | | 0.3992 | | 0 |
6
+ | - p2 | | | |f1 | | 0.3916 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3992 | | 0 |
8
+ | - RE | | | |f1 | | 0.1003 | |0 |
9
+ | - p1 | | | |f1 | | 0.0998 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1055 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0956 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0385 | |0 |
5
+ | - p1 | | | |f1 | | 0.0387 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0380 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0387 | | 0 |
8
+ | - RE | | | |f1 | | 0.0174 | |0 |
9
+ | - p1 | | | |f1 | | 0.0121 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0280 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0121 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3507 | |0 |
5
+ | - p1 | | | |f1 | | 0.3444 | | 0 |
6
+ | - p2 | | | |f1 | | 0.3632 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3444 | | 0 |
8
+ | - RE | | | |f1 | | 0.0884 | |0 |
9
+ | - p1 | | | |f1 | | 0.0734 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1045 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0875 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0438 | |0 |
5
+ | - p1 | | | |f1 | | 0.0429 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0456 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0429 | | 0 |
8
+ | - RE | | | |f1 | | 0.1278 | |0 |
9
+ | - p1 | | | |f1 | | 0.0967 | | 0 |
10
+ | - p2 | | | |f1 | | 0.1900 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0967 | | 0 |
csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3720 | |0 |
5
+ | - p1 | | | |f1 | | 0.3558 | | 0 |
6
+ | - p2 | | | |f1 | | 0.4045 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3558 | | 0 |
8
+ | - RE | | | |f1 | | 0.0762 | |0 |
9
+ | - p1 | | | |f1 | | 0.0787 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0781 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0719 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0578 | |0 |
5
+ | - p1 | | | |f1 | | 0.0940 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0331 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0464 | | 0 |
8
+ | - RE | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1317 | |0 |
5
+ | - p1 | | | |f1 | | 0.1215 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1415 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1322 | | 0 |
8
+ | - RE | | | |f1 | | 0.0031 | |0 |
9
+ | - p1 | | | |f1 | | 0.0028 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0016 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0049 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0769 | |0 |
5
+ | - p1 | | | |f1 | | 0.0859 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0591 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0859 | | 0 |
8
+ | - RE | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1448 | |0 |
5
+ | - p1 | | | |f1 | | 0.1455 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1434 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1455 | | 0 |
8
+ | - RE | | | |f1 | | 0.0010 | |0 |
9
+ | - p1 | | | |f1 | | 0.0024 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0007 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0812 | |0 |
5
+ | - p1 | | | |f1 | | 0.0770 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0920 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0747 | | 0 |
8
+ | - RML | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
12
+ | - DIA | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - HIS | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - RE | | | |f1 | | 0.0000 | |0 |
21
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1694 | |0 |
5
+ | - p1 | | | |f1 | | 0.1616 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1774 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1690 | | 0 |
8
+ | - RE | | | |f1 | | 0.0048 | |0 |
9
+ | - p1 | | | |f1 | | 0.0035 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0064 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0046 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0308 | |0 |
5
+ | - p1 | | | |f1 | | 0.0244 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0436 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0244 | | 0 |
8
+ | - RE | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1516 | |0 |
5
+ | - p1 | | | |f1 | | 0.1500 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1548 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1500 | | 0 |
8
+ | - RE | | | |f1 | | 0.0032 | |0 |
9
+ | - p1 | | | |f1 | | 0.0040 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0023 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0034 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0712 | |0 |
5
+ | - p1 | | | |f1 | | 0.0880 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0375 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0880 | | 0 |
8
+ | - RE | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1444 | |0 |
5
+ | - p1 | | | |f1 | | 0.1485 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1360 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1485 | | 0 |
8
+ | - RE | | | |f1 | | 0.0027 | |0 |
9
+ | - p1 | | | |f1 | | 0.0038 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0024 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0020 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0711 | |0 |
5
+ | - p1 | | | |f1 | | 0.0777 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0579 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0777 | | 0 |
8
+ | - RE | | | |f1 | | 0.0000 | |0 |
9
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1422 | |0 |
5
+ | - p1 | | | |f1 | | 0.1470 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1325 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1470 | | 0 |
8
+ | - RE | | | |f1 | | 0.0080 | |0 |
9
+ | - p1 | | | |f1 | | 0.0073 | | 0 |
10
+ | - p2 | | | |f1 | | 0.0074 | | 0 |
11
+ | - p3 | | | |f1 | | 0.0093 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2500 | |0 |
5
+ | - p1 | | | |f1 | | 0.3425 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1181 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2893 | | 0 |
8
+ | - RE | | | |f1 | | 0.4075 | |0 |
9
+ | - p1 | | | |f1 | | 0.4135 | | 0 |
10
+ | - p2 | | | |f1 | | 0.3917 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4172 | | 0 |
12
+ | - RML | | | |f1 | | 0.0000 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0001 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0002 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.5993 | |0 |
5
+ | - p1 | | | |f1 | | 0.6091 | | 0 |
6
+ | - p2 | | | |f1 | | 0.5646 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6243 | | 0 |
8
+ | - RE | | | |f1 | | 0.6164 | |0 |
9
+ | - p1 | | | |f1 | | 0.6332 | | 0 |
10
+ | - p2 | | | |f1 | | 0.6025 | | 0 |
11
+ | - p3 | | | |f1 | | 0.6133 | | 0 |
12
+ | - RML | | | |f1 | | 0.2843 | |0 |
13
+ | - p1 | | | |f1 | | 0.2129 | | 0 |
14
+ | - p2 | | | |f1 | | 0.3222 | | 0 |
15
+ | - p3 | | | |f1 | | 0.3178 | | 0 |
16
+ | - DIA | | | |f1 | | 0.1658 | |0 |
17
+ | - p1 | | | |f1 | | 0.3073 | | 0 |
18
+ | - p2 | | | |f1 | | 0.1137 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0764 | | 0 |
20
+ | - HIS | | | |f1 | | 0.2370 | |0 |
21
+ | - p1 | | | |f1 | | 0.1244 | | 0 |
22
+ | - p2 | | | |f1 | | 0.4429 | | 0 |
23
+ | - p3 | | | |f1 | | 0.1437 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.1290 | |0 |
5
+ | - p1 | | | |f1 | | 0.1339 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1191 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1339 | | 0 |
8
+ | - RE | | | |f1 | | 0.3957 | |0 |
9
+ | - p1 | | | |f1 | | 0.3796 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4266 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3810 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6028 | |0 |
5
+ | - p1 | | | |f1 | | 0.6119 | | 0 |
6
+ | - p2 | | | |f1 | | 0.5847 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6119 | | 0 |
8
+ | - RE | | | |f1 | | 0.6056 | |0 |
9
+ | - p1 | | | |f1 | | 0.5962 | | 0 |
10
+ | - p2 | | | |f1 | | 0.6024 | | 0 |
11
+ | - p3 | | | |f1 | | 0.6183 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2137 | |0 |
5
+ | - p1 | | | |f1 | | 0.2467 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1709 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2234 | | 0 |
8
+ | - RE | | | |f1 | | 0.4016 | |0 |
9
+ | - p1 | | | |f1 | | 0.4173 | | 0 |
10
+ | - p2 | | | |f1 | | 0.3770 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4106 | | 0 |
12
+ | - RML | | | |f1 | | 0.0002 | |0 |
13
+ | - p1 | | | |f1 | | 0.0007 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
16
+ | - DIA | | | |f1 | | 0.0000 | |0 |
17
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
18
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
20
+ | - HIS | | | |f1 | | 0.0000 | |0 |
21
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
22
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6569 | |0 |
5
+ | - p1 | | | |f1 | | 0.6719 | | 0 |
6
+ | - p2 | | | |f1 | | 0.6327 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6661 | | 0 |
8
+ | - RE | | | |f1 | | 0.5952 | |0 |
9
+ | - p1 | | | |f1 | | 0.5767 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5998 | | 0 |
11
+ | - p3 | | | |f1 | | 0.6093 | | 0 |
12
+ | - RML | | | |f1 | | 0.1557 | |0 |
13
+ | - p1 | | | |f1 | | 0.1111 | | 0 |
14
+ | - p2 | | | |f1 | | 0.1599 | | 0 |
15
+ | - p3 | | | |f1 | | 0.1960 | | 0 |
16
+ | - DIA | | | |f1 | | 0.2496 | |0 |
17
+ | - p1 | | | |f1 | | 0.4407 | | 0 |
18
+ | - p2 | | | |f1 | | 0.1328 | | 0 |
19
+ | - p3 | | | |f1 | | 0.1753 | | 0 |
20
+ | - HIS | | | |f1 | | 0.2339 | |0 |
21
+ | - p1 | | | |f1 | | 0.0817 | | 0 |
22
+ | - p2 | | | |f1 | | 0.5103 | | 0 |
23
+ | - p3 | | | |f1 | | 0.1096 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0586 | |0 |
5
+ | - p1 | | | |f1 | | 0.0697 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0364 | | 0 |
7
+ | - p3 | | | |f1 | | 0.0697 | | 0 |
8
+ | - RE | | | |f1 | | 0.4022 | |0 |
9
+ | - p1 | | | |f1 | | 0.3803 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4464 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3800 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6092 | |0 |
5
+ | - p1 | | | |f1 | | 0.6226 | | 0 |
6
+ | - p2 | | | |f1 | | 0.5824 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6226 | | 0 |
8
+ | - RE | | | |f1 | | 0.5944 | |0 |
9
+ | - p1 | | | |f1 | | 0.5991 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5466 | | 0 |
11
+ | - p3 | | | |f1 | | 0.6375 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.0955 | |0 |
5
+ | - p1 | | | |f1 | | 0.1220 | | 0 |
6
+ | - p2 | | | |f1 | | 0.0426 | | 0 |
7
+ | - p3 | | | |f1 | | 0.1220 | | 0 |
8
+ | - RE | | | |f1 | | 0.4116 | |0 |
9
+ | - p1 | | | |f1 | | 0.4027 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4294 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4027 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6419 | |0 |
5
+ | - p1 | | | |f1 | | 0.6386 | | 0 |
6
+ | - p2 | | | |f1 | | 0.6486 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6386 | | 0 |
8
+ | - RE | | | |f1 | | 0.5899 | |0 |
9
+ | - p1 | | | |f1 | | 0.5894 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5845 | | 0 |
11
+ | - p3 | | | |f1 | | 0.5959 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3398 | |0 |
5
+ | - p1 | | | |f1 | | 0.3910 | | 0 |
6
+ | - p2 | | | |f1 | | 0.2375 | | 0 |
7
+ | - p3 | | | |f1 | | 0.3910 | | 0 |
8
+ | - RE | | | |f1 | | 0.3777 | |0 |
9
+ | - p1 | | | |f1 | | 0.3775 | | 0 |
10
+ | - p2 | | | |f1 | | 0.3783 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3775 | | 0 |
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6371 | |0 |
5
+ | - p1 | | | |f1 | | 0.6467 | | 0 |
6
+ | - p2 | | | |f1 | | 0.6178 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6467 | | 0 |
8
+ | - RE | | | |f1 | | 0.5837 | |0 |
9
+ | - p1 | | | |f1 | | 0.5949 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5782 | | 0 |
11
+ | - p3 | | | |f1 | | 0.5781 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.3279 | |0 |
5
+ | - p1 | | | |f1 | | 0.3804 | | 0 |
6
+ | - p2 | | | |f1 | | 0.3068 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2964 | | 0 |
8
+ | - RE | | | |f1 | | 0.4658 | |0 |
9
+ | - p1 | | | |f1 | | 0.4734 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4649 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4591 | | 0 |
12
+ | - RML | | | |f1 | | 0.0015 | |0 |
13
+ | - p1 | | | |f1 | | 0.0005 | | 0 |
14
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p2 | | | |f1 | | 0.0057 | | 0 |
16
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
17
+ | - DIA | | | |f1 | | 0.0002 | |0 |
18
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
20
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
21
+ | - p3 | | | |f1 | | 0.0006 | | 0 |
22
+ | - HIS | | | |f1 | | 0.0000 | |0 |
23
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
24
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
25
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.5895 | |0 |
5
+ | - p1 | | | |f1 | | 0.5970 | | 0 |
6
+ | - p2 | | | |f1 | | 0.5602 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6113 | | 0 |
8
+ | - RE | | | |f1 | | 0.6440 | |0 |
9
+ | - p1 | | | |f1 | | 0.6482 | | 0 |
10
+ | - p2 | | | |f1 | | 0.6469 | | 0 |
11
+ | - p3 | | | |f1 | | 0.6370 | | 0 |
12
+ | - RML | | | |f1 | | 0.0931 | |0 |
13
+ | - p1 | | | |f1 | | 0.1501 | | 0 |
14
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p2 | | | |f1 | | 0.1383 | | 0 |
16
+ | - p3 | | | |f1 | | 0.0839 | | 0 |
17
+ | - DIA | | | |f1 | | 0.0286 | |0 |
18
+ | - p1 | | | |f1 | | 0.0311 | | 0 |
19
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
20
+ | - p3 | | | |f1 | | 0.0546 | | 0 |
21
+ | - HIS | | | |f1 | | 0.0659 | |0 |
22
+ | - p1 | | | |f1 | | 0.0247 | | 0 |
23
+ | - p2 | | | |f1 | | 0.1557 | | 0 |
24
+ | - p3 | | | |f1 | | 0.0174 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.4506 | |0 |
5
+ | - p1 | | | |f1 | | 0.5976 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1568 | | 0 |
7
+ | - p3 | | | |f1 | | 0.5976 | | 0 |
8
+ | - RE | | | |f1 | | 0.4104 | |0 |
9
+ | - p1 | | | |f1 | | 0.4393 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4083 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3834 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.6175 | |0 |
5
+ | - p1 | | | |f1 | | 0.6196 | | 0 |
6
+ | - p2 | | | |f1 | | 0.6131 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6196 | | 0 |
8
+ | - RE | | | |f1 | | 0.5840 | |0 |
9
+ | - p1 | | | |f1 | | 0.5913 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5896 | | 0 |
11
+ | - p3 | | | |f1 | | 0.5710 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2734 | |0 |
5
+ | - p1 | | | |f1 | | 0.3758 | | 0 |
6
+ | - p2 | | | |f1 | | 0.1647 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2796 | | 0 |
8
+ | - RE | | | |f1 | | 0.4370 | |0 |
9
+ | - p1 | | | |f1 | | 0.4505 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4159 | | 0 |
11
+ | - p3 | | | |f1 | | 0.4447 | | 0 |
12
+ | - RML | | | |f1 | | 0.0004 | |0 |
13
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
14
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
15
+ | - p2 | | | |f1 | | 0.0017 | | 0 |
16
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
17
+ | - DIA | | | |f1 | | 0.0000 | |0 |
18
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
19
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
20
+ | - p3 | | | |f1 | | 0.0000 | | 0 |
21
+ | - HIS | | | |f1 | | 0.0003 | |0 |
22
+ | - p1 | | | |f1 | | 0.0000 | | 0 |
23
+ | - p2 | | | |f1 | | 0.0000 | | 0 |
24
+ | - p3 | | | |f1 | | 0.0008 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.7005 | |0 |
5
+ | - p1 | | | |f1 | | 0.6934 | | 0 |
6
+ | - p2 | | | |f1 | | 0.7152 | | 0 |
7
+ | - p3 | | | |f1 | | 0.6930 | | 0 |
8
+ | - RE | | | |f1 | | 0.5641 | |0 |
9
+ | - p1 | | | |f1 | | 0.5801 | | 0 |
10
+ | - p2 | | | |f1 | | 0.5595 | | 0 |
11
+ | - p3 | | | |f1 | | 0.5526 | | 0 |
12
+ | - RML | | | |f1 | | 0.0762 | |0 |
13
+ | - p1 | | | |f1 | | 0.0398 | | 0 |
14
+ | - p2 | | | |f1 | | 0.0599 | | 0 |
15
+ | - p3 | | | |f1 | | 0.1025 | | 0 |
16
+ | - p3 | | | |f1 | | 0.1025 | | 0 |
17
+ | - DIA | | | |f1 | | 0.1086 | |0 |
18
+ | - p1 | | | |f1 | | 0.2322 | | 0 |
19
+ | - p2 | | | |f1 | | 0.0109 | | 0 |
20
+ | - p3 | | | |f1 | | 0.0828 | | 0 |
21
+ | - HIS | | | |f1 | | 0.0353 | |0 |
22
+ | - p1 | | | |f1 | | 0.0186 | | 0 |
23
+ | - p2 | | | |f1 | | 0.0602 | | 0 |
24
+ | - p3 | | | |f1 | | 0.0272 | | 0 |
csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
2
+ |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
3
+ |-------|-------|------|------|------|----|------|---|------|
4
+ | - NER | | | |f1 | | 0.2428 | |0 |
5
+ | - p1 | | | |f1 | | 0.2486 | | 0 |
6
+ | - p2 | | | |f1 | | 0.2311 | | 0 |
7
+ | - p3 | | | |f1 | | 0.2486 | | 0 |
8
+ | - RE | | | |f1 | | 0.4074 | |0 |
9
+ | - p1 | | | |f1 | | 0.3865 | | 0 |
10
+ | - p2 | | | |f1 | | 0.4569 | | 0 |
11
+ | - p3 | | | |f1 | | 0.3788 | | 0 |