# Qpsychometric / df_filterer.py
# Fadi12's picture
# Qpsychometric Space
# 59a8a7c
import pandas as pd
from datetime import datetime, timezone
def verify_df_intergrity(df):
    """Check that the compared columns of ``df`` share no values.

    Columns are compared pairwise by position. Two positions are treated
    specially, exactly as the loop bounds below dictate:

    * the last column is never part of any comparison, and
    * the column at position 7 is never used as the left-hand side of a
      comparison (it is still compared against columns that precede it).

    NOTE(review): both exclusions look tied to a specific column layout that
    is not visible here -- confirm they are intentional before relying on
    this as a general uniqueness check.

    Returns:
        ``False`` as soon as one compared pair of columns has at least one
        value in common, ``True`` otherwise (including for frames with
        fewer than three columns, where nothing is compared).
    """
    names = df.columns.tolist()
    # names[:-1] / names[...:-1]: the final column is excluded on both sides.
    for position, left_name in enumerate(names[:-1]):
        if position == 7:
            continue
        for right_name in names[position + 1:-1]:
            shared = set(df[left_name]) & set(df[right_name])
            if shared:
                return False
    return True
class ModelsData:
    """Filterable view over a DataFrame of model evaluation rows.

    Indexing with a value (``obj[key]``) returns a new ``ModelsData`` holding
    only the rows where one of the filter columns equals that value, so
    lookups can be chained, e.g. ``obj[model_id][questionnaire_name]``.

    The methods below read these columns: ``model_id``,
    ``model_version_id``, ``questionnaire_name``, ``questionnaire_task`` and
    ``mean_score``.
    """

    class KeyErrorInCache(Exception):
        """Exception raised when a key is not found in the DataFrame cache."""

    # Columns searched by ``__getitem__``, in priority order: the first
    # column that contains the key decides the filter.
    _FILTER_COLUMNS = (
        "model_id",
        "model_version_id",
        "questionnaire_name",
        "questionnaire_task",
    )

    def __init__(self, df):
        """Store a re-indexed copy of ``df`` after validating it.

        Raises:
            ValueError: if ``df`` is not a pandas DataFrame, or if
                ``verify_df_intergrity`` rejects it.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame.")
        # reset_index returns a new frame, so the caller's frame is untouched.
        self.df = df.reset_index(drop=True)
        if not verify_df_intergrity(self.df):
            raise ValueError(f"The {self.df.columns.tolist()} must not have any common values.")

    def __getitem__(self, key):
        """Return a new ``ModelsData`` with the rows matching ``key``.

        The filter columns are tried in order (``model_id``,
        ``model_version_id``, ``questionnaire_name``, ``questionnaire_task``)
        and the first one containing ``key`` is used.

        Example:
            ``obj['some/model-id']['ASI']['QMNLI'].get_mean_score()``

        Raises:
            ModelsData.KeyErrorInCache: if no filter column contains ``key``.
        """
        for column in self._FILTER_COLUMNS:
            if key in self.df[column].values:
                # No explicit reset_index here: the constructor re-indexes,
                # and doing it in place on a filtered slice can trigger
                # pandas' SettingWithCopyWarning.
                return ModelsData(self.df[self.df[column] == key])
        raise ModelsData.KeyErrorInCache("These specific parameters are not found in cache.")

    def get_model_version_id(self):
        """Return the ``model_version_id`` of the single remaining row.

        Raises:
            ValueError: if the frame does not hold exactly one row
                (this includes an empty frame).
        """
        if len(self.df) != 1:
            raise ValueError("To get model_version_id the DF must be exactly of length 1, use indexing to filter the desired model and questionnaire and then call `get_model_version_id`")
        return self.df["model_version_id"].to_list()[0]

    def get_evaluation_results(self):
        """Render the rows as a ``model-index`` YAML string.

        Expects the columns ``model_id`` (identical on every row),
        ``questionnaire_task``, ``questionnaire_name`` and ``mean_score``.
        One ``results`` entry is emitted per row, in this layout::

            model-index:
              - name: <model_id>
                results:
                  - task:
                      type: <questionnaire_task>
                    dataset:
                      name: <questionnaire_name>
                      type: Qpsychometric
                    metrics:
                      - name: Mean-Score
                        type: Mean-Score
                        value: <mean_score>
                    source:
                      name: Qpsychometric Space
                      url: https://huggingface.co/spaces/cnai-lab/Qpsychometric
                  - ...

        Values are interpolated as-is; no YAML quoting or escaping is applied.

        Raises:
            ValueError: if the frame is empty or contains more than one
                unique ``model_id``.
        """
        unique_model_ids = self.df["model_id"].unique()
        if len(unique_model_ids) > 1:
            raise ValueError(
                f"Multiple model_ids found: {unique_model_ids}. "
                "Expected only one unique model_id."
            )
        if len(unique_model_ids) == 0:
            # Without this guard the lookup below fails with a bare IndexError.
            raise ValueError("No rows found. Expected exactly one unique model_id.")
        model_id = unique_model_ids[0]

        # Indentation is significant in YAML: each nesting level below is
        # two spaces deeper than its parent key.
        lines = [
            "model-index:",
            f"  - name: {model_id}",
            "    results:",
        ]
        for _, row in self.df.iterrows():
            lines.extend(
                [
                    "      - task:",
                    f"          type: {row['questionnaire_task']}",
                    "        dataset:",
                    f"          name: {row['questionnaire_name']}",
                    "          type: Qpsychometric",
                    "        metrics:",
                    "          - name: Mean-Score",
                    "            type: Mean-Score",
                    f"            value: {row['mean_score']}",
                    "        source:",
                    "          name: Qpsychometric Space",
                    "          url: https://huggingface.co/spaces/cnai-lab/Qpsychometric",
                ]
            )
        return "\n".join(lines)

    def get_mean_score(self):
        """Return ``{"avg_mean_score": <mean_score>}`` for the single remaining row.

        Raises:
            ValueError: if the frame does not hold exactly one row
                (this includes an empty frame).
        """
        if len(self.df) != 1:
            raise ValueError("To get mean score the DF must be exactly of length 1, use indexing to filter the desired model and then call `get_mean_score`")
        return {"avg_mean_score": self.df["mean_score"].to_list()[0]}

    def __str__(self):
        """String representation of the DataFrame."""
        return self.df.to_string()

    def __len__(self):
        """Number of rows currently held."""
        return len(self.df)