Spaces:
No application file
No application file
| """ | |
| Data service provider | |
| """ | |
| import json | |
| from typing import List | |
| import pandas as pd | |
| from app.backend.constant import ModelProvider | |
| from utils.cache_decorator import cache_df_with_custom_key, cache_dict_with_custom_key | |
| from utils.http_utils import get | |
# Descriptive (non-metric) leaderboard columns mapped to their display type,
# listed in the order they appear in the table.
_INFO_COLUMN_TYPES = {
    'model_name': 'markdown',
    'group_name': 'str',
    'leaderboard': 'str',
    'dataset_name': 'str',
    'embd_dtype': 'str',
    'embd_dim': 'str',
    'num_params': 'number',
    'max_tokens': 'number',
    'similarity': 'str',
    'query_instruct': 'str',
    'corpus_instruct': 'str',
}

# One column per (metric, cutoff) pair: all ndcg columns first, then recall,
# then precision, each at cutoffs 1, 3, 5, 10, 20, 50 and 100.
_METRIC_COLUMNS = [
    f'{metric}_at_{cutoff}'
    for metric in ('ndcg', 'recall', 'precision')
    for cutoff in (1, 3, 5, 10, 20, 50, 100)
]

# Full column order of the leaderboard frame, and the display type of each
# column at the same position (every metric column is a number).
COLUMNS = list(_INFO_COLUMN_TYPES) + _METRIC_COLUMNS
COLUMNS_TYPES = list(_INFO_COLUMN_TYPES.values()) + ['number'] * len(_METRIC_COLUMNS)

# Raw-content base URL of the published benchmark results and the three JSON
# documents read from it.
GIT_URL = "https://raw.githubusercontent.com/embedding-benchmark/ebr/refs/heads/main/results/"
DATASET_URL = GIT_URL + "datasets.json"
MODEL_URL = GIT_URL + "models.json"
RESULT_URL = GIT_URL + "results.json"
| class DataEngine: | |
| def __init__(self): | |
| self.df = self.init_dataframe() | |
def models(self):
    """
    Get models data

    Requests MODEL_URL and returns the decoded JSON body when the response
    status is 200; any other status yields an empty dict.
    """
    response = get(MODEL_URL)
    if response.status_code != 200:
        return {}
    return response.json()
def datasets(self):
    """
    Get datasets data

    Requests DATASET_URL and returns the decoded JSON body when the response
    status is 200; any other status yields an empty dict.
    """
    response = get(DATASET_URL)
    if response.status_code != 200:
        return {}
    return response.json()
def results(self):
    """
    Get results data

    Requests RESULT_URL and returns the decoded JSON body when the response
    status is 200; any other status yields an empty dict.
    """
    response = get(RESULT_URL)
    if response.status_code != 200:
        return {}
    return response.json()
| def init_dataframe(self): | |
| """ | |
| Initialize DataFrame | |
| """ | |
| return self.jsons_to_df() | |
| def get_data(self): | |
| """ | |
| Get the full dataset | |
| """ | |
| df = self.df.copy() | |
| # 移除指定列 | |
| columns_to_remove = ['group_name', 'leaderboard', 'dataset_name'] | |
| df = df.drop(columns=columns_to_remove) | |
| # 按 NDCG@10 降序排序 | |
| return df.sort_values(by='ndcg_at_10', ascending=False) | |
| def get_filtered_data(self, navigation=None, embd_type=None, embd_dims=None, similarity=None): | |
| """ | |
| Get filtered dataset based on criteria | |
| """ | |
| filtered_df = self.df.copy() | |
| if navigation and navigation != "all": | |
| filtered_df = filtered_df[filtered_df['leaderboard'] == navigation] | |
| if embd_type and embd_type != "all": | |
| filtered_df = filtered_df[filtered_df['embd_dtype'] == embd_type] | |
| if similarity and similarity != "all": | |
| filtered_df = filtered_df[filtered_df['similarity'] == similarity] | |
| if embd_dims and isinstance(embd_dims, list) and len(embd_dims) > 0: | |
| filtered_df = filtered_df[filtered_df['embd_dim'].isin(embd_dims)] | |
| # 移除指定列 | |
| columns_to_remove = ['group_name', 'leaderboard', 'dataset_name'] | |
| filtered_df = filtered_df.drop(columns=columns_to_remove) | |
| # 按 NDCG@10 降序排序 | |
| return filtered_df.sort_values(by='ndcg_at_10', ascending=False) | |
| def _check_providers(self, organization: str, providers: List): | |
| if not providers: | |
| return True | |
| if "Others" in providers: | |
| if organization not in ( | |
| ModelProvider.OPENAI.value, ModelProvider.COHERE.value, ModelProvider.VOYAGEAI.value): | |
| return True | |
| return organization in providers | |
def jsons_to_df(self):
    """
    Build the leaderboard DataFrame from the results, datasets and models payloads

    Steps:
      1. Expand every results entry (keys "dataset_name" and "results") into
         rows tagged with its dataset name.
      2. Expand every datasets entry (keys "name", "datasets", "leaderboard")
         into one (group_name, dataset_name, leaderboard) row per dataset.
      3. Inner-join results with models on model_name / embd_dim / embd_dtype,
         then with the dataset table on dataset_name.
      4. Replace model_name with an HTML link pointing at the model's
         "reference" value.

    Returns:
        pd.DataFrame: the joined rows restricted to COLUMNS. An empty frame
        with the COLUMNS header is returned when any payload is empty or
        when nothing survives the joins.
    """
    def _payload(source):
        # results / datasets / models may be exposed either as plain methods
        # or as properties; call the former so a bound method is never
        # iterated as if it were the data.
        return source() if callable(source) else source

    results_list = _payload(self.results)
    datasets_list = _payload(self.datasets)
    models_list = _payload(self.models)

    empty = pd.DataFrame(columns=COLUMNS)
    # The fetchers fall back to {} on a non-200 response. pd.concat raises
    # ValueError on an empty sequence and the merges need the join columns,
    # so bail out before touching pandas.
    if not results_list or not datasets_list or not models_list:
        return empty

    df_results_list = []
    for result_dict in results_list:
        df_result_row = pd.DataFrame(result_dict["results"])
        df_result_row["dataset_name"] = result_dict["dataset_name"]
        df_results_list.append(df_result_row)
    df_result = pd.concat(df_results_list)

    df_datasets_list = []
    for item in datasets_list:
        dataset_names = item["datasets"]
        df_datasets_list.append(pd.DataFrame({
            "group_name": [item["name"]] * len(dataset_names),
            "dataset_name": dataset_names,
            "leaderboard": [item["leaderboard"]] * len(dataset_names),
        }))
    df_dataset = pd.concat(df_datasets_list).drop_duplicates()

    df_model = pd.DataFrame(models_list)

    df = pd.merge(df_result, df_model, on=["model_name", "embd_dim", "embd_dtype"], how="inner")
    df = pd.merge(df, df_dataset, on="dataset_name", how="inner")

    # Check emptiness before building the links: a row-wise transform on an
    # empty frame has nothing to produce and the column assignment can fail.
    if df.empty:
        return empty

    # NOTE(review): reference and model_name are interpolated into the HTML
    # unescaped; confirm the upstream JSON is trusted.
    df["model_name"] = [
        f'<a target="_blank" style="text-decoration: underline" href="{reference}">{model_name}</a>'
        for reference, model_name in zip(df["reference"], df["model_name"])
    ]
    return df[COLUMNS]
def filter_df(self, df_result: pd.DataFrame, embd_dtype: str, embd_dims: List, similarity: str, max_tokens: int):
    """
    Filter result rows by dtype, similarity, token limit and dimension bucket

    Args:
        df_result: frame to filter; it is never modified.
        embd_dtype: exact 'embd_dtype' to keep; falsy or "all" disables it.
        embd_dims: selected dimension-bucket labels, any of '<=1k', '1k-2k',
            '2k-5k', '>=5k'. An empty selection matches nothing.
        similarity: exact 'similarity' to keep; falsy or "all" disables it.
        max_tokens: keep rows whose 'max_tokens' is at least this value;
            falsy disables it.

    Returns:
        pd.DataFrame: the matching rows restricted to COLUMNS, or an empty
        slice of df_result (all of its columns) when embd_dims is empty.
    """
    if not embd_dims:
        return df_result[0:0]

    filtered = df_result
    if embd_dtype and embd_dtype != "all":
        filtered = filtered[filtered['embd_dtype'] == embd_dtype]
    if similarity and similarity != "all":
        filtered = filtered[filtered['similarity'] == similarity]
    if max_tokens:
        filtered = filtered[filtered['max_tokens'] >= max_tokens]

    # Bucket the embedding dimension. right=False makes the intervals
    # left-closed: [0, 1000), [1000, 2000), [2000, 5000), [5000, inf), so a
    # dimension of exactly 1000 lands in '1k-2k'.
    bins = [0, 1000, 2000, 5000, float('inf')]
    labels = ['<=1k', '1k-2k', '2k-5k', '>=5k']
    # Keep the buckets in a standalone Series instead of writing a helper
    # column into the frame: the frame may be the caller's own object (when
    # no filter above applied) or a slice of it.
    dim_group = pd.cut(filtered['embd_dim'], bins=bins, labels=labels, right=False)
    filtered = filtered[dim_group.isin(embd_dims)]

    return filtered[COLUMNS]
| def summarize_dataframe(self): | |
| """ | |
| Summarize data statistics | |
| """ | |