add nb_shots and make request work

Files changed:
- app.py (+16 -1)
- requirements.txt (+3 -1)
- src/display/utils.py (+1 -0)
- src/leaderboard/read_evals.py (+39 -50)
- src/populate.py (+3 -3)

app.py CHANGED

@@ -25,6 +25,7 @@ from src.display.utils import (
     Precision,
     generate_column_name
 )
+from src.display.plot_curves import plot_curves
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, HF_TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval

@@ -53,7 +54,7 @@ except Exception:
     restart_space()


-
+results, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS)
 leaderboard_df = original_df.copy()

 (

@@ -89,6 +90,7 @@ def select_columns(df: pd.DataFrame, columns: list, phenotypes: list, metrics:list
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
+        AutoEvalColumn.nb_shots.name,
     ]

     task_cols = []

@@ -197,6 +199,13 @@ with demo:
                 elem_id="search-bar",
             )
             with gr.Column(min_width=320):
+                filter_nb_shots = gr.CheckboxGroup(
+                    label="Number of shots",
+                    choices=["Zero-shot", "10-shot", "All"],
+                    value=["Zero-shot", "10-shot", "All"],
+                    interactive=True,
+                    elem_id="filter-nb-shots",
+                )
                 filter_columns_type = gr.CheckboxGroup(
                     label="Model types",
                     choices=[t.to_str() for t in ModelType],

@@ -234,6 +243,12 @@ with demo:
                     interactive=False,
                     visible=True,
                 )
+
+                # Plotting the curves
+                # gr.Plot(
+                #     plot_curves(),
+                #     elem_id="plot-curves"
+                # )

                 # Dummy leaderboard for handling the case when the user uses backspace key
                 hidden_leaderboard_table_for_search = gr.components.Dataframe(
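
Note: the hunks above only declare the new filter_nb_shots checkbox; the wiring that makes it filter the table is not visible in this diff. A minimal sketch of the kind of callback such a control usually feeds, assuming the leaderboard dataframe carries the #Shots column added in src/display/utils.py (the helper name and the label-to-value mapping below are illustrative, not taken from this repo):

import pandas as pd

from src.display.utils import AutoEvalColumn

# Illustrative mapping from the checkbox labels used in the diff to #Shots values;
# how "All" is encoded in the result files is an assumption.
SHOT_LABEL_TO_VALUE = {"Zero-shot": 0, "10-shot": 10}


def filter_models_by_nb_shots(df: pd.DataFrame, selected_labels: list) -> pd.DataFrame:
    """Keep only the rows whose #Shots value matches one of the selected labels."""
    if not selected_labels or "All" in selected_labels:
        return df
    allowed = [SHOT_LABEL_TO_VALUE[label] for label in selected_labels if label in SHOT_LABEL_TO_VALUE]
    return df[df[AutoEvalColumn.nb_shots.name].isin(allowed)]

If hooked up like the other filter CheckboxGroups, filter_nb_shots would be added to the inputs of the existing table-update event so its selection reaches a helper of this kind.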

requirements.txt CHANGED

@@ -15,4 +15,6 @@ transformers==4.35.2
 tokenizers>=0.15.0
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+python-dotenv==1.0.1
+plotly==5.22.0

src/display/utils.py CHANGED

@@ -30,6 +30,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["nb_shots", ColumnContent, ColumnContent("#Shots", "number", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
 auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
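
Note: a single append is enough here because, in the leaderboard template this Space appears to be built on, auto_eval_column_dict is passed to dataclasses.make_dataclass, so the new entry becomes an AutoEvalColumn.nb_shots field that app.py and read_evals.py can reference. A self-contained sketch of that mechanism (the ColumnContent fields mirror the stock template and may differ slightly from this repo):

from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["nb_shots", ColumnContent, ColumnContent("#Shots", "number", True, never_hidden=True)])

# Each [field_name, field_type, default] triple becomes a dataclass field, so the
# new column is reachable as AutoEvalColumn.nb_shots elsewhere in the app.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.nb_shots.name)  # "#Shots"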

src/leaderboard/read_evals.py CHANGED

@@ -15,13 +15,14 @@ from src.submission.check_validity import is_model_on_hub
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
-    eval_name: str #
+    eval_name: str # org_model_precision_feature-set_nb-shots (uid)
     full_model: str # org/model (path on hub)
     org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     raw_data: dict
+    nb_shots: int = 0
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter

@@ -37,28 +38,26 @@ class EvalResult:
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
-
+
+        # Get config
         config = data.get("config")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
+        full_model = config.get("model")
+        org = full_model.split("/")[0]
+        model = full_model.split("/")[1]
+        precision = Precision.from_str(config.get("precision"))
+        revision = config.get("revision", "")
+        feature_set = config.get("feature_set", "Unknown")
+        nb_shots = config.get("nb_shots", None)
+        model_type = ModelType.from_str(config.get("model_type", ""))
+        weight_type = WeightType[config.get("weight_type", "Original")]
+        license = config.get("license", "?")
+        likes = config.get("likes", 0)
+        num_params = config.get("params", 0)
+        date = config.get("submitted_time", "")
+
+        # Check if model is still on hub
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model,
+            full_model, revision, trust_remote_code=True, test_tokenizer=False, token=os.environ.get("TOKEN")
         )
         architecture = "?"
         if model_config is not None:

@@ -66,52 +65,43 @@ class EvalResult:
             if architectures:
                 architecture = ";".join(architectures)

-        # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
             task = task.value

-            mean = data["results"].get(task.phenotype_key, {}).get("_".join(["mean", task.metric_key]), None)
-            lower = data["results"].get(task.phenotype_key, {}).get("_".join(["lower", task.metric_key]), None)
-            upper = data["results"].get(task.phenotype_key, {}).get("_".join(["upper", task.metric_key]), None)
+            mean = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["mean", task.metric_key]), None)
+            lower = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["lower", task.metric_key]), None)
+            upper = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["upper", task.metric_key]), None)
             formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
             results["_".join([task.phenotype_key, task.metric_key])] = formated_score

         return self(
-            eval_name=
+            eval_name=f"{org}_{model}_{precision.value.name}_{feature_set}_{nb_shots}",
             full_model=full_model,
-            org=
-            model=
+            org=full_model.split("/")[0],
+            model=full_model.split("/")[1],
             results=results,
             raw_data=data,
+            nb_shots=nb_shots,
             precision=precision,
-            revision=
+            revision=revision,
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            model_type=model_type,
+            weight_type=weight_type,
+            license=license,
+            likes=likes,
+            num_params=num_params,
+            date=date
         )

-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average_auroc = np.mean(np.array([d["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d.keys()]))
-        average_auprc = np.mean(np.array([d["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d.keys()]))
+        average_auroc = np.mean(np.array([d["metrics"]["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d["metrics"].keys()]))
+        average_auprc = np.mean(np.array([d["metrics"]["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d["metrics"].keys()]))
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.nb_shots.name: self.nb_shots,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,

@@ -154,7 +144,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []


@@ -176,7 +166,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
         eval_name = eval_result.eval_name
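
Note: the reworked init_from_json_file implies a particular layout for the result JSON files, with everything that used to come from the separate request file now living under config, and the per-phenotype scores sitting one level deeper under a metrics key. A sketch of that expected shape, with the key names taken from the diff and every concrete value invented for the example:

# Illustrative shape of a result file the new parser can read; values are made up.
example_result = {
    "config": {
        "model": "some-org/some-model",        # -> full_model, org, model
        "precision": "float16",                # -> Precision.from_str
        "revision": "main",
        "feature_set": "Unknown",
        "nb_shots": 10,                        # surfaced as the new #Shots column
        "model_type": "",
        "weight_type": "Original",
        "license": "?",
        "likes": 0,
        "params": 7,
        "submitted_time": "2024-01-01T00:00:00Z",
    },
    "results": {
        "some_phenotype": {                    # task.phenotype_key
            "metrics": {                       # new nesting level read by the diff
                "mean_auroc": 0.81,
                "lower_auroc": 0.78,
                "upper_auroc": 0.84,
                "mean_auprc": 0.42,
                "lower_auprc": 0.38,
                "upper_auprc": 0.46,
            }
        }
    },
}

The same nesting explains the change in to_dict: the average AUROC/AUPRC are now pulled from d["metrics"]["mean_auroc"] instead of d["mean_auroc"].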

src/populate.py CHANGED

@@ -5,12 +5,12 @@ import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult


-def get_leaderboard_df(results_path: str,
+def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path
+    raw_data = get_raw_eval_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
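
Note: the tail of get_leaderboard_df is outside this diff. Judging by the stock leaderboard template and by app.py unpacking two values (results, original_df), the function most likely finishes along the lines below; the sorting column and the NaN filtering are assumptions, not code from this repo:

import pandas as pd

from src.display.formatting import has_no_nan_values
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    # Assumed continuation: rank by the headline metric, keep the requested columns,
    # and drop rows that are missing any of them.
    df = df.sort_values(by=[AutoEvalColumn.average_auroc.name], ascending=False)
    df = df[cols].round(decimals=2)
    df = df[has_no_nan_values(df, cols)]
    return raw_data, df

Since app.py expects a (raw results, dataframe) pair, the -> pd.DataFrame annotation in the diff is narrower than what the function appears to return.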