Spaces:

saeedfarzi
/

trail_leaderboard

Sleeping

App Files Files Community

Sfarzi committed on Oct 21, 2025

Commit

f7a50a0

1 Parent(s): 02fbbb9

Initial clone with modifications

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.ipynb_checkpoints/preprocess_models_output-checkpoint.py +264 -0
app.py +37 -49
csv_new/llm_scores_p1_final.xlsx +0 -0
csv_new/llm_scores_p2_final.xlsx +0 -0
csv_new/llm_scores_p3_final.xlsx +0 -0
csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt +23 -0
csv_new/output/.ipynb_checkpoints/epfl-llm__meditron-7b__gr__0shot-checkpoint.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt +23 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt +23 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt +23 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt +23 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt +11 -0
csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt +23 -0
csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt +23 -0
csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt +22 -0
csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt +23 -0
csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt +11 -0
csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt +23 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt +23 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt +23 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt +23 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt +25 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt +24 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt +11 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt +24 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt +24 -0
csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt +11 -0

.ipynb_checkpoints/preprocess_models_output-checkpoint.py ADDED Viewed

	@@ -0,0 +1,264 @@

+"""
+EVALITA LLM EVALUATION PROCESSOR
+Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
+DATA PIPELINE OVERVIEW:
+1. Inputs:
+   - Evaluation Results: Raw .out files from lm-eval-harness
+   - Model Metadata: Pre-collected .json files from HuggingFace
+2. Output:
+   - Comprehensive evaluation reports in JSON format
+   - Ready for ingestion into the evaluation leaderboard
+--------------------------------------------------------------------
+INPUT SPECIFICATION
+Evaluation Results (.out format):
+   hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
+   | Task          | Metric | Value  | Stderr |
+   |---------------|--------|--------|--------|
+   | main-task     | acc    | 0.5605 | 0.0052 |
+   | - sub-task    | acc    | 0.4640 | 0.0088 |
+   |   - prompt-1  | acc    | 0.3720 | 0.0216 |
+Model Metadata (.json format):
+   {
+     "model": "model-org/model-name",
+     "base_model": "ModelArchitecture",
+     "revision": "git_commit_hash",
+     "parameters": 8.03,
+     "language": "en_it"
+   }
+--------------------------------------------------------------------
+OUTPUT SPECIFICATION
+Evaluation Report (.json format):
+   {
+     "summary_metrics": {
+       "average_CPS": 41.74,
+       "num_tasks": 12
+     },
+     "model_config": {
+       "identifier": "model-org/model-name",
+       "architecture": "ModelArchitecture",
+       "parameters": 8.03,
+       "evaluation_settings": {
+         "fewshot": 5,
+         "batch_size": 1
+       }
+     },
+     "task_results": {
+       "task-name": {
+         "average_score": 52.60,
+         "best_prompt": {
+           "id": "prompt-6",
+           "score": 66.57
+         },
+         "prompt_analysis": [
+           {
+             "prompt_id": "prompt-1",
+             "score": 37.20,
+             "stderr": 0.0216
+           }
+         ]
+       }
+     }
+   }
+"""
+import json
+import os
+import re
+import statistics
def safe_float(value):
    """Convert ``value`` to ``float``, returning ``None`` if that is impossible.

    Args:
        value: Anything ``float()`` accepts, typically the text of a table
            cell (``float`` itself tolerates surrounding whitespace).

    Returns:
        The converted float, or ``None`` when ``value`` is not numeric text or
        is of a type ``float()`` rejects (for example ``None``).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: unsupported type such as None.
        return None
def calculate_task_metrics(task_info):
    """Fill in the aggregate metrics of one task from its per-prompt scores.

    ``task_info`` is updated in place with:

    * ``average_accuracy`` - mean of the prompt scores
    * ``std_accuracy``     - sample standard deviation (0.0 for a single score)
    * ``best_prompt``      - the highest prompt score
    * ``prompt_id``        - the ``'prompt'`` name of that best-scoring entry
    * ``CPS``              - ``(1 - (best - average) / 100) * best``

    Prompts whose ``'value'`` is ``None`` are ignored everywhere, including
    when picking the best prompt. If no prompt has a usable value the dict is
    left untouched.

    Args:
        task_info: Dict with a ``'prompts'`` list; every entry is a dict with
            at least the keys ``'prompt'`` and ``'value'``.

    Returns:
        None.
    """
    scored_prompts = [p for p in task_info['prompts'] if p['value'] is not None]
    if not scored_prompts:
        return None

    accuracies = [p['value'] for p in scored_prompts]
    average = sum(accuracies) / len(accuracies)
    task_info['average_accuracy'] = average
    # statistics.stdev needs at least two data points.
    task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

    # Select the best prompt among scored prompts only; comparing against a
    # None value would raise TypeError.
    best_prompt_data = max(scored_prompts, key=lambda p: p['value'])
    best = best_prompt_data['value']
    task_info['best_prompt'] = best
    task_info['prompt_id'] = best_prompt_data['prompt']

    # CPS scales the best score down by the best-to-average gap; the gap is
    # divided by 100, so scores are expected on a 0-100 scale.
    task_info['CPS'] = (1 - (best - average) / 100) * best
    return None
def extract_data_from_file(file_path):
    """Parse one evaluation output file into task, prompt and config data.

    The file is expected to contain a header line such as
    ``hf (pretrained=org/name ), num_fewshot: 0, batch_size: 1`` followed by a
    pipe-delimited table. Rows whose first cell starts with ``" - "`` open a
    task; rows whose first cell starts with ``"   - "`` are prompts of the
    most recent task. Prompt values are stored multiplied by 100.

    Args:
        file_path: Path of the result file. The language is taken from a
            ``__en__`` / ``__sl__`` / ``__it__`` / ``__gr__`` / ``__sk__`` /
            ``__pl__`` tag in the path; if none is present an error line is
            printed and ``LANG`` stays empty.

    Returns:
        Dict with ``'average_CPS'`` (mean CPS over tasks that have one, 0 if
        none), ``'config'`` (``model_name``, ``num_fewshot`` as text,
        ``batch_size`` as int or None, ``LANG``) and ``'tasks'`` (task name ->
        task dict as filled in by ``calculate_task_metrics``).

    Raises:
        ValueError: If the file has no usable ``hf (pretrained=...)`` header.
    """
    # When several tags occur in the path, the last one listed here wins.
    language_tags = (("__en__", "EN"), ("__sl__", "SL"), ("__it__", "IT"),
                     ("__gr__", "GR"), ("__sk__", "SK"), ("__pl__", "PL"))
    LANG = ""
    for tag, code in language_tags:
        if tag in file_path:
            LANG = code
    if LANG == "":
        print("ERROR: ", file_path)

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    pretrained_model = None
    num_fewshot = None
    batch_size = None
    tasks_data = {}
    current_task = None
    prompt_marker = "   - "

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        # Model configuration header.
        if line.startswith(("hf (pretrained=", "hf(pretrained=")):
            # Stop at whitespace, ',' or ')' so both "name )" and "name)," and
            # "name,dtype=..." yield just the model identifier.
            model_match = re.search(r"pretrained=([^\s,)]+)", line)
            pretrained_model = model_match.group(1) if model_match else None
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
            continue

        # Data rows have nine cells, i.e. eleven pieces once split on '|'.
        # Table header and separator rows also have eleven pieces but are
        # ignored below because their first cell carries no " - " marker.
        columns = line.split('|')
        if len(columns) != 11:
            continue

        task_name = columns[1]
        metric = columns[5].strip()
        # Normalized accuracy rows are not used.
        if metric == "acc_norm":
            continue
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])

        if task_name.startswith(" - "):
            current_task = task_name[3:].strip()
            tasks_data.setdefault(current_task,
                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None,
                                   'prompt_id': None, 'CPS': None})
        elif task_name.startswith(prompt_marker) and current_task:
            prompt_name = task_name[len(prompt_marker):].strip()
            if value is None:
                # A non-numeric score cannot be scaled or ranked; skip the row
                # rather than crash on ``None * 100``.
                print(f"WARNING: skipping non-numeric value for {current_task}/{prompt_name} in {file_path}")
                continue
            tasks_data[current_task]['prompts'].append(
                {'prompt': prompt_name, 'metric': metric, 'value': value * 100, 'stderr': stderr})

    if pretrained_model is None:
        raise ValueError(f"No 'hf (pretrained=...)' header found in {file_path}")

    # Special handling for the "evalita NER" task: collapse the per-subset
    # prompts into two weighted prompt scores.
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}
        weighted_values = {"prompt-1": 0, "prompt-2": 0}
        # NOTE(review): this sums the weights of BOTH prompts, so each result
        # below is divided by twice its own weight total - confirm intended.
        total_weights = sum(weight_map.values())
        for prompt in task_info['prompts']:
            if prompt['prompt'] in weight_map:
                if "prompt-1" in prompt['prompt']:
                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
                elif "prompt-2" in prompt['prompt']:
                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
        task_info['prompts'] = [
            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
             'stderr': None},
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]

    # Per-task aggregates, then the mean CPS over tasks that obtained one.
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "LANG": LANG
    }
    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
# MAIN PROCESSING PIPELINE
# Runs at module level. For every ``.txt`` result file in ``directory_in_path``:
#   1. parse it with ``extract_data_from_file``;
#   2. if ``<directory_in_requests_path>/<org>/<model>.json`` exists, merge that
#      metadata into the parsed config (metadata keys override parsed ones);
#   3. write the combined report to
#      ``<directory_out_results_path>/<org>/<model>_<num_fewshot>_<LANG>.json``.
directory_in_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/csv_files/outputs/'
directory_in_requests_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/e3c_llm_requests/'
directory_out_results_path = '/home/sfarzi/leaderboard/MediLingua_Leaderboard/e3c_llm_results/'

# Sorted so that runs process (and log) files in a reproducible order.
for filename in sorted(os.listdir(directory_in_path)):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(directory_in_path, filename)
    json_output = extract_data_from_file(file_path)

    # The output layout needs exactly "<org>/<model>"; anything else is
    # reported and skipped so one odd file does not abort the whole batch.
    full_model_name = json_output['config']['model_name']
    model_org_name, separator, model_name = full_model_name.partition('/')
    if not separator or not model_org_name or not model_name or '/' in model_name:
        print(f"File {filename} skipped: model name {full_model_name!r} is not of the form 'org/name'")
        continue

    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
    if os.path.exists(config_file_path):
        with open(config_file_path, 'r', encoding='utf-8') as config_file:
            additional_config = json.load(config_file)
        json_output['config'].update(additional_config)

    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
    os.makedirs(org_folder_path, exist_ok=True)

    # Computed after the metadata merge, exactly as before, so overridden
    # ``num_fewshot`` / ``LANG`` values are reflected in the file name.
    file_suffix = f"{json_output['config']['num_fewshot']}_{json_output['config']['LANG']}"
    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
    with open(output_file_path, 'w', encoding='utf-8', newline="\n") as outfile:
        json.dump(json_output, outfile, indent=4)

    print(f"File {filename} processed and saved to {output_file_path}")

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ import numpy as np
 # === NEW: helper for prompt sensitivity (simple: only NER/REL and 3 prompts) ===
 def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
     """
-    Computes a simple Prompt Sensitivity Index (PSI) over the tasks (NER, REL)
     using the distribution of 'Best Prompt Id' across the provided prompt_ids.
     """
     cv_per_task = []
@@ -47,14 +47,14 @@ def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
 def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None):
     """
-    Table with best overall model per task (NER, REL) and the model with the best prompt score.
     Applies optional filters:
       - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
       - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
     """
-    tasks = ["NER", "REL"]
     df = dataframe.copy()
     if lang and lang != "All" and "LANG" in df.columns:
         df = df[df["LANG"] == lang]
     if shot and shot != "All" and "IS_FS" in df.columns:
@@ -66,8 +66,13 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
         if task not in df.columns or df.empty:
             continue
         # Best overall on task
-        max_idx = df[task].idxmax()
-        model_raw = df.loc[max_idx, 'Model']
         if isinstance(model_raw, str) and '<' in model_raw:
             match = re.search(r'>([^<]+)<', model_raw)
             model_name = match.group(1) if match else model_raw
@@ -78,8 +83,11 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
         # Best prompt row for task
         best_prompt_column = f"{task} Best Prompt"
         if best_prompt_column in df.columns and df[best_prompt_column].notna().any():
-            best_prompt_idx = df[best_prompt_column].idxmax()
-            best_prompt_model_raw = df.loc[best_prompt_idx, 'Model']
             if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
                 match = re.search(r'>([^<]+)<', best_prompt_model_raw)
                 best_prompt_model = match.group(1) if match else best_prompt_model_raw
@@ -118,7 +126,7 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
     subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
     fig.update_layout(
-        title={'text': f"Top Model per Task: CPS & Best Prompt (NER/REL) — {', '.join(subtitle)}",
                'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
         font=dict(family="Arial", size=11),
         height=420, margin=dict(l=20, r=20, t=50, b=80)
@@ -130,10 +138,10 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
 # === NEW: Best-model comparison table (only NER, REL) ===
 def create_best_model_comparison_table_without_lang(dataframe):
     """
-    Table with the best overall model per task (NER, REL) and the model that
     achieves the best score with its own best prompt.
     """
-    tasks = ["NER", "REL"]
     table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
     for task in tasks:
@@ -208,7 +216,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
       - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
       - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
     """
-    tasks = ["NER", "REL"]
     df = dataframe.copy()
     # Language filter
@@ -269,7 +277,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
     title_parts.append(lang if (lang and lang != "All") else "All languages")
     title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
     fig.update_layout(
-        title={'text': f"Most Effective Prompts (NER/REL) — {', '.join(title_parts)}",
                'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
         xaxis_title="Task", yaxis_title="Prompt",
         font=dict(family="Arial", size=11), margin=dict(b=100),
@@ -286,7 +294,7 @@ def create_prompt_heatmap_without_lang(dataframe):
     for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
     ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
     """
-    tasks = ["NER", "REL"]
     # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
     all_ids = set()
@@ -381,13 +389,13 @@ def mean_of_max_per_field(df):
     Calcola il massimo per ciascun campo e poi la media dei massimi.
     Args:
-        df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL
     Returns:
         float: media dei valori massimi dei campi
     """
     #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    fields = ["NER", "REL"]
     #print(df.columns)
     # Controlla che tutte le colonne esistano nel DataFrame
@@ -396,7 +404,7 @@ def mean_of_max_per_field(df):
         raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
     # Calcola il massimo per ciascun campo
-    max_values = df[fields].max()
     # Calcola la media dei massimi
     mean_max = max_values.mean()
@@ -406,7 +414,7 @@ def mean_of_max_per_field(df):
 def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
     if tasks is None:
-        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
     task_means = {}
@@ -481,7 +489,7 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
     #print(dataframe.columns)
     #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    tasks =["NER", "REL"]
     if dataframe is None:
         np.random.seed(42)
         dataframe = pd.DataFrame({
@@ -799,10 +807,12 @@ TASK_METADATA_MULTIPLECHOICE = {
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_GENERATIVE = {
-    #"LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
-    #"SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
     "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
     "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
 }
 def restart_space():
@@ -895,8 +905,10 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
     """
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
-    sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
     # aggiungo la colonna rank in base alla posizione
     sorted_dataframe = sorted_dataframe.reset_index(drop=True)
@@ -972,31 +984,7 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
         interactive=False
     )
-'''
-# Helper function for leaderboard initialization
-def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
-    """Initialize and return a leaderboard."""
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-'''
 def download_snapshot(repo, local_dir):
     """Try to download a snapshot from Hugging Face Hub."""
@@ -1087,8 +1075,8 @@ with demo:
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
-                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
-                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
             )

 # === NEW: helper for prompt sensitivity (simple: only NER/REL and 3 prompts) ===
 def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
     """
+    Computes a simple Prompt Sensitivity Index (PSI) over the tasks
     using the distribution of 'Best Prompt Id' across the provided prompt_ids.
     """
     cv_per_task = []
 def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None):
     """
+    Table with best overall model per task  and the model with the best prompt score.
     Applies optional filters:
       - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
       - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
     """
+    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
     df = dataframe.copy()
     if lang and lang != "All" and "LANG" in df.columns:
         df = df[df["LANG"] == lang]
     if shot and shot != "All" and "IS_FS" in df.columns:
         if task not in df.columns or df.empty:
             continue
         # Best overall on task
+        #max_idx = df[task].idxmax()
+        max_idx = pd.to_numeric(df[task], errors='coerce').idxmax()
+        try:
+          model_raw = df.loc[max_idx, 'Model']
+        except Exception as e:
+          break
         if isinstance(model_raw, str) and '<' in model_raw:
             match = re.search(r'>([^<]+)<', model_raw)
             model_name = match.group(1) if match else model_raw
         # Best prompt row for task
         best_prompt_column = f"{task} Best Prompt"
         if best_prompt_column in df.columns and df[best_prompt_column].notna().any():
+            best_prompt_idx= pd.to_numeric(df[best_prompt_column],errors='coerce').idxmax()
+            try:
+             best_prompt_model_raw = df.loc[best_prompt_idx, 'Model']
+            except Exception as e:
+             break
             if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
                 match = re.search(r'>([^<]+)<', best_prompt_model_raw)
                 best_prompt_model = match.group(1) if match else best_prompt_model_raw
     subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
     fig.update_layout(
+        title={'text': f"Top Model per Task: CPS & Best Prompt  — {', '.join(subtitle)}",
                'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
         font=dict(family="Arial", size=11),
         height=420, margin=dict(l=20, r=20, t=50, b=80)
 # === NEW: Best-model comparison table (only NER, REL) ===
 def create_best_model_comparison_table_without_lang(dataframe):
     """
+    Table with the best overall model per task (NER, REL,) and the model that
     achieves the best score with its own best prompt.
     """
+    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
     table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
     for task in tasks:
       - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
       - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
     """
+    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
     df = dataframe.copy()
     # Language filter
     title_parts.append(lang if (lang and lang != "All") else "All languages")
     title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
     fig.update_layout(
+        title={'text': f"Most Effective Prompts  — {', '.join(title_parts)}",
                'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
         xaxis_title="Task", yaxis_title="Prompt",
         font=dict(family="Arial", size=11), margin=dict(b=100),
     for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
     ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
     """
+    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
     # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
     all_ids = set()
     Calcola il massimo per ciascun campo e poi la media dei massimi.
     Args:
+        df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL, RML, DIA, HIS
     Returns:
         float: media dei valori massimi dei campi
     """
     #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+    fields = ["NER", "REL", "RML", "DIA", "HIS"]
     #print(df.columns)
     # Controlla che tutte le colonne esistano nel DataFrame
         raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
     # Calcola il massimo per ciascun campo
+    max_values = df[fields].apply(pd.to_numeric, errors='coerce').max(skipna=True)
     # Calcola la media dei massimi
     mean_max = max_values.mean()
 def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
     if tasks is None:
+        tasks = [ "NER", "REL", "RML", "DIA", "HIS"]
     task_means = {}
     #print(dataframe.columns)
     #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+    tasks =["NER", "REL", "RML", "HIS", "DIA"]
     if dataframe is None:
         np.random.seed(42)
         dataframe = pd.DataFrame({
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_GENERATIVE = {
     "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
     "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
+    "RML": {"icon": "😃", "name": "CRF RML", "tooltip": "CRF RML"},
+    "DIA": {"icon": "🏥", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
+    "HIS": {"icon": "📝", "name": "CRF History", "tooltip": "CRF History"},
 }
 def restart_space():
     """
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
+    clean_df = dataframe.assign( **{"Combined Performance": pd.to_numeric(dataframe["Combined Performance"], errors="coerce")}).loc[lambda df: df["Combined Performance"].notna() & (df["Combined Performance"] != 0)]
+    sorted_dataframe = clean_df.sort_values(by="Combined Performance", ascending=False)
     # aggiungo la colonna rank in base alla posizione
     sorted_dataframe = sorted_dataframe.reset_index(drop=True)
         interactive=False
     )
 def download_snapshot(repo, local_dir):
     """Try to download a snapshot from Hugging Face Hub."""
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
+                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"]]
             )

csv_new/llm_scores_p1_final.xlsx ADDED Viewed

Binary file (32.1 kB). View file

csv_new/llm_scores_p2_final.xlsx ADDED Viewed

Binary file (26.9 kB). View file

csv_new/llm_scores_p3_final.xlsx ADDED Viewed

Binary file (27.8 kB). View file

csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2877 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1963 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3459 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3208 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4430 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4487 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4492 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4311 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/.ipynb_checkpoints/epfl-llm__meditron-7b__gr__0shot-checkpoint.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2426 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2417 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2443 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2417 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0592 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1556 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0161 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0058 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0918 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0629 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1041 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1083 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.2604 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1287 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3394 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3131 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2142 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2189 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2243 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1994 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.1681 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1189 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1668 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2185 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.1779 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1825 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1612 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1900 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.1500 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2415 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1416 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0668 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0147 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0178 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0068 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0194 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0611 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0620 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0592 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0620 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0863 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1017 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0506 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1065 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1474 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1667 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1089 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1667 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0970 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0821 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1053 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1036 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0416 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0435 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0429 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0384 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.1413 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0672 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2266 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1300 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3753 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3299 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4023 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3938 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.1331 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0977 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1226 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1789 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.1044 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0821 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1119 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1190 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0007 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0010 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0002 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0008 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0379 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0379 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0378 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0379 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0891 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0602 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1293 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0778 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3966 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3992 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3916 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3992 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.1003 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0998 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1055 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0956 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0385 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0387 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0380 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0387 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0174 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0121 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0280 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0121 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3507 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3444 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3632 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3444 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0884 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0734 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1045 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0875 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0438 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0429 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0456 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0429 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.1278 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0967 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1900 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0967 |   | 0 |

csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3720 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3558 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4045 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3558 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0762 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0787 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0781 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0719 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0578 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0940 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0331 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0464 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1317 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1215 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1415 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1322 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0031 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0028 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0016 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0049 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0769 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0859 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0591 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0859 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1448 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1455 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1434 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1455 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0010 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0024 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0007 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0812 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0770 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0920 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0747 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1694 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1616 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1774 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1690 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0048 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0035 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0064 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0046 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0308 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0244 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0436 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0244 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1516 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1500 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1548 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1500 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0032 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0040 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0023 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0034 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0712 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0880 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0375 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0880 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1444 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1485 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1360 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1485 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0027 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0038 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0024 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0020 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0711 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0777 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0579 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0777 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1422 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1470 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1325 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1470 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.0080 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0073 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0074 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0093 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2500 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3425 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1181 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2893 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4075 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4135 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3917 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4172 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0001 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0002 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.5993 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6091 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5646 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6243 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.6164 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6332 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6025 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6133 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.2843 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2129 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3222 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3178 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.1658 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3073 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1137 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0764 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.2370 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1244 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4429 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1437 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.1290 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1339 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1191 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1339 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.3957 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3796 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4266 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3810 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6028 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6119 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5847 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6119 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.6056 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5962 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6024 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6183 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2137 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2467 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1709 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2234 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4016 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4173 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3770 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4106 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0002 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0007 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6569 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6719 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6327 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6661 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5952 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5767 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5998 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6093 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.1557 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1111 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1599 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1960 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.2496 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4407 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1328 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1753 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.2339 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0817 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5103 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1096 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0586 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0697 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0364 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0697 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4022 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3803 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4464 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3800 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6092 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6226 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5824 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6226 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5944 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5991 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5466 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6375 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.0955 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1220 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0426 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1220 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4116 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4027 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4294 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4027 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6419 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6386 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6486 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6386 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5899 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5894 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5845 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5959 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3398 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3910 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2375 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3910 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.3777 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3775 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3783 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3775 |   | 0 |

csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6371 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6467 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6178 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6467 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5837 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5949 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5782 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5781 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.3279 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3804 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3068 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2964 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4658 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4734 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4649 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4591 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0015 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0005 |   | 0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0057 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0002 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0006 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.5895 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5970 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5602 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6113 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.6440 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6482 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6469 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6370 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0931 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1501 |   | 0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1383 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0839 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0286 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0311 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0546 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0659 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0247 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1557 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0174 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.4506 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5976 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1568 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5976 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4104 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4393 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4083 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3834 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.6175 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6196 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6131 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6196 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5840 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5913 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5896 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5710 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2734 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3758 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1647 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2796 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4370 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4505 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4159 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4447 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0004 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0017 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0003 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0008 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.7005 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6934 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.7152 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6930 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.5641 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5801 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5595 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5526 |   | 0 |
+| - RML        |       |      |      |f1    |   | 0.0762 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0398 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0599 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1025 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1025 |   | 0 |
+| - DIA        |       |      |      |f1    |   | 0.1086 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2322 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0109 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0828 |   | 0 |
+| - HIS        |       |      |      |f1    |   | 0.0353 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0186 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0602 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0272 |   | 0 |

csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - NER        |       |      |      |f1    |   | 0.2428 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2486 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2311 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2486 |   | 0 |
+| - RE        |       |      |      |f1    |   | 0.4074 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3865 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4569 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3788 |   | 0 |