Spaces:
Sleeping
Sleeping
Joschka Strueber
committed on
Commit
·
ce6be70
1
Parent(s):
5d4059c
[Add, Fix] change to CAPA, fix error in dataloading
Browse files- app.py +2 -2
- src/dataloading.py +5 -3
- src/similarity.py +3 -3
app.py
CHANGED
|
@@ -110,7 +110,7 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
|
| 110 |
info="Open LLM Leaderboard v2 benchmark datasets"
|
| 111 |
)
|
| 112 |
metric_dropdown = gr.Dropdown(
|
| 113 |
-
choices=["
|
| 114 |
label="Select Metric",
|
| 115 |
info="Select a similarity metric to compute"
|
| 116 |
)
|
|
@@ -158,7 +158,7 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
|
| 158 |
- **Models**: Open LLM Leaderboard models \n
|
| 159 |
- Every model evaluation is gated on Hugging Face and access has to be requested. \n
|
| 160 |
- We requested access for the most popular models, but some may be missing. \n
|
| 161 |
-
- **Metrics**:
|
| 162 |
|
| 163 |
if __name__ == "__main__":
|
| 164 |
demo.launch(ssr_mode=False)
|
|
|
|
| 110 |
info="Open LLM Leaderboard v2 benchmark datasets"
|
| 111 |
)
|
| 112 |
metric_dropdown = gr.Dropdown(
|
| 113 |
+
choices=["CAPA", "CAPA (det.)", "Error Consistency"],
|
| 114 |
label="Select Metric",
|
| 115 |
info="Select a similarity metric to compute"
|
| 116 |
)
|
|
|
|
| 158 |
- **Models**: Open LLM Leaderboard models \n
|
| 159 |
- Every model evaluation is gated on Hugging Face and access has to be requested. \n
|
| 160 |
- We requested access for the most popular models, but some may be missing. \n
|
| 161 |
+
- **Metrics**: CAPA (probabilistic), CAPA (deterministic), Error Consistency""")
|
| 162 |
|
| 163 |
if __name__ == "__main__":
|
| 164 |
demo.launch(ssr_mode=False)
|
src/dataloading.py
CHANGED
|
@@ -9,17 +9,19 @@ def get_leaderboard_models():
|
|
| 9 |
api = HfApi()
|
| 10 |
|
| 11 |
# List all datasets in the open-llm-leaderboard organization
|
| 12 |
-
|
| 13 |
|
| 14 |
models = []
|
| 15 |
-
for dataset in
|
| 16 |
if dataset.id.endswith("-details"):
|
| 17 |
dataset_id = dataset.id
|
| 18 |
try:
|
| 19 |
# Check if the dataset can be loaded
|
|
|
|
| 20 |
check_gated = datasets.get_dataset_config_names(dataset_id)
|
|
|
|
| 21 |
# Format: "open-llm-leaderboard/<provider>__<model_name>-details"
|
| 22 |
-
model_part =
|
| 23 |
if "__" in model_part:
|
| 24 |
provider, model = model_part.split("__", 1)
|
| 25 |
models.append(f"{provider}/{model}")
|
|
|
|
| 9 |
api = HfApi()
|
| 10 |
|
| 11 |
# List all datasets in the open-llm-leaderboard organization
|
| 12 |
+
dataset_list = api.list_datasets(author="open-llm-leaderboard")
|
| 13 |
|
| 14 |
models = []
|
| 15 |
+
for dataset in dataset_list:
|
| 16 |
if dataset.id.endswith("-details"):
|
| 17 |
dataset_id = dataset.id
|
| 18 |
try:
|
| 19 |
# Check if the dataset can be loaded
|
| 20 |
+
print(dataset_id)
|
| 21 |
check_gated = datasets.get_dataset_config_names(dataset_id)
|
| 22 |
+
print(check_gated)
|
| 23 |
# Format: "open-llm-leaderboard/<provider>__<model_name>-details"
|
| 24 |
+
model_part = dataset_id.split("/")[-1].replace("-details", "")
|
| 25 |
if "__" in model_part:
|
| 26 |
provider, model = model_part.split("__", 1)
|
| 27 |
models.append(f"{provider}/{model}")
|
src/similarity.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import numpy as np
|
| 2 |
|
| 3 |
-
from lmsim.metrics import Metrics,
|
| 4 |
|
| 5 |
from src.dataloading import load_run_data
|
| 6 |
from src.utils import softmax, one_hot
|
|
@@ -32,9 +32,9 @@ def compute_similarity(metric: Metrics, outputs_a: list[np.array], outputs_b: li
|
|
| 32 |
def compute_pairwise_similarities(metric_name: str, probs: list[list[np.array]], gts: list[list[int]]) -> np.array:
|
| 33 |
# Select chosen metric
|
| 34 |
if metric_name == "Kappa_p (prob.)":
|
| 35 |
-
metric =
|
| 36 |
elif metric_name == "Kappa_p (det.)":
|
| 37 |
-
metric =
|
| 38 |
# Convert probabilities to one-hot
|
| 39 |
probs = [[one_hot(p) for p in model_probs] for model_probs in probs]
|
| 40 |
elif metric_name == "Error Consistency":
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
|
| 3 |
+
from lmsim.metrics import Metrics, CAPA, EC
|
| 4 |
|
| 5 |
from src.dataloading import load_run_data
|
| 6 |
from src.utils import softmax, one_hot
|
|
|
|
| 32 |
def compute_pairwise_similarities(metric_name: str, probs: list[list[np.array]], gts: list[list[int]]) -> np.array:
|
| 33 |
# Select chosen metric
|
| 34 |
if metric_name == "Kappa_p (prob.)":
|
| 35 |
+
metric = CAPA()
|
| 36 |
elif metric_name == "Kappa_p (det.)":
|
| 37 |
+
metric = CAPA(prob=False)
|
| 38 |
# Convert probabilities to one-hot
|
| 39 |
probs = [[one_hot(p) for p in model_probs] for model_probs in probs]
|
| 40 |
elif metric_name == "Error Consistency":
|