Spaces:
Sleeping
Sleeping
Commit
·
7b849c4
1
Parent(s):
1db0fcf
Try Visualization
Browse files
app.py
CHANGED
|
@@ -52,10 +52,12 @@ class RelativeScaler(BaseEstimator, TransformerMixin):
|
|
| 52 |
|
| 53 |
return np.hstack(transformed)
|
| 54 |
|
|
|
|
| 55 |
N_COMPONENTS = 3
|
| 56 |
TSNE_NEIGHBOURS = 15
|
| 57 |
# WEIGHT_FACTOR = 0.05
|
| 58 |
|
|
|
|
| 59 |
TOOLTIPS = """
|
| 60 |
<div>
|
| 61 |
<div>
|
|
@@ -70,6 +72,7 @@ TOOLTIPS = """
|
|
| 70 |
</div>
|
| 71 |
"""
|
| 72 |
|
|
|
|
| 73 |
def config_style():
|
| 74 |
# st.set_page_config(layout="wide")
|
| 75 |
st.markdown("""
|
|
@@ -85,6 +88,7 @@ def config_style():
|
|
| 85 |
""", unsafe_allow_html=True)
|
| 86 |
st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
|
| 87 |
|
|
|
|
| 88 |
def load_embeddings(model, version, embedding_prefix, weight_factor):
|
| 89 |
if model == "Donut":
|
| 90 |
df_real = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_all_{weight_factor}embeddings.csv")
|
|
@@ -275,6 +279,7 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
|
|
| 275 |
st.error("Modelo no reconocido")
|
| 276 |
return None
|
| 277 |
|
|
|
|
| 278 |
def split_versions(df_combined, reduced):
|
| 279 |
# Asignar las coordenadas si la reducción es 2D
|
| 280 |
if reduced.shape[1] == 2:
|
|
@@ -294,6 +299,7 @@ def split_versions(df_combined, reduced):
|
|
| 294 |
unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
|
| 295 |
return df_dict, unique_subsets
|
| 296 |
|
|
|
|
| 297 |
def get_embedding_from_df(df):
|
| 298 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
| 299 |
if 'embedding' in df.columns:
|
|
@@ -303,6 +309,7 @@ def get_embedding_from_df(df):
|
|
| 303 |
else:
|
| 304 |
raise ValueError("No se encontró embedding o coordenadas x,y en el DataFrame.")
|
| 305 |
|
|
|
|
| 306 |
def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
|
| 307 |
if metric.lower() == "wasserstein":
|
| 308 |
n = synthetic_points.shape[0]
|
|
@@ -334,6 +341,7 @@ def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein"
|
|
| 334 |
else:
|
| 335 |
raise ValueError("Métrica desconocida. Usa 'wasserstein', 'euclidean' o 'kl'.")
|
| 336 |
|
|
|
|
| 337 |
def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list, metric="wasserstein", bins=20) -> pd.DataFrame:
|
| 338 |
distances = {}
|
| 339 |
groups = synthetic_df.groupby(['source', 'label'])
|
|
@@ -355,6 +363,7 @@ def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, d
|
|
| 355 |
distances[key][real_label] = d
|
| 356 |
return pd.DataFrame(distances).T
|
| 357 |
|
|
|
|
| 358 |
def compute_continuity(X, X_embedded, n_neighbors=5):
|
| 359 |
n = X.shape[0]
|
| 360 |
D_high = pairwise_distances(X, metric='euclidean')
|
|
@@ -375,6 +384,7 @@ def compute_continuity(X, X_embedded, n_neighbors=5):
|
|
| 375 |
continuity_value = 1 - norm * total
|
| 376 |
return continuity_value
|
| 377 |
|
|
|
|
| 378 |
def create_table(df_distances):
|
| 379 |
df_table = df_distances.copy()
|
| 380 |
df_table.reset_index(inplace=True)
|
|
@@ -438,6 +448,7 @@ def create_figure(dfs, unique_subsets, color_maps, model_name):
|
|
| 438 |
fig.legend.visible = show_legend
|
| 439 |
return fig, real_renderers, synthetic_renderers, pretrained_renderers
|
| 440 |
|
|
|
|
| 441 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
| 442 |
renderers = {}
|
| 443 |
for label in selected_labels:
|
|
@@ -467,6 +478,7 @@ def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_la
|
|
| 467 |
renderers[label + f" ({group_label})"] = r
|
| 468 |
return renderers
|
| 469 |
|
|
|
|
| 470 |
def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_label):
|
| 471 |
renderers = {}
|
| 472 |
for label in labels:
|
|
@@ -516,6 +528,7 @@ def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_l
|
|
| 516 |
renderers[label + f" ({group_label})"] = r
|
| 517 |
return renderers
|
| 518 |
|
|
|
|
| 519 |
def get_color_maps(unique_subsets, result):
|
| 520 |
color_map = {}
|
| 521 |
num_real = len(unique_subsets["real"])
|
|
@@ -548,6 +561,7 @@ def get_color_maps(unique_subsets, result):
|
|
| 548 |
|
| 549 |
return color_map
|
| 550 |
|
|
|
|
| 551 |
def calculate_cluster_centers(df, labels):
|
| 552 |
centers = {}
|
| 553 |
for label in labels:
|
|
@@ -556,22 +570,27 @@ def calculate_cluster_centers(df, labels):
|
|
| 556 |
centers[label] = (subset['x'].mean(), subset['y'].mean())
|
| 557 |
return centers
|
| 558 |
|
|
|
|
| 559 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
|
| 560 |
if reduction_method == "PCA":
|
| 561 |
reducer = Pipeline([
|
| 562 |
("pca", PCA(n_components=N_COMPONENTS)),
|
| 563 |
("rel_scaler", RelativeScaler())
|
| 564 |
])
|
|
|
|
| 565 |
elif reduction_method == "UMAP":
|
| 566 |
reducer = umap.UMAP(n_components=N_COMPONENTS,
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
|
|
|
|
|
|
| 571 |
else:
|
| 572 |
reducer = TSNE(n_components=2, random_state=42,
|
| 573 |
-
|
| 574 |
-
|
|
|
|
| 575 |
|
| 576 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
| 577 |
# reduced = reducer.fit_transform(df_combined[df_combined["version"] == "real"][embedding_cols].values)
|
|
@@ -874,7 +893,6 @@ def run_model(model_name):
|
|
| 874 |
tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
|
| 875 |
|
| 876 |
result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method, distance_metric=distance_metric.lower())
|
| 877 |
-
print(result)
|
| 878 |
|
| 879 |
reg_metrics = pd.DataFrame({
|
| 880 |
"Slope": [result["slope"]],
|
|
@@ -1033,24 +1051,32 @@ def run_model(model_name):
|
|
| 1033 |
key=f"download_button_excel_{model_name}"
|
| 1034 |
)
|
| 1035 |
|
| 1036 |
-
if reduction_method
|
| 1037 |
st.markdown("## PCA - Solo Muestras Reales")
|
| 1038 |
# -------------------------------------------------------------------------
|
| 1039 |
# 1. PCA sobre las muestras reales
|
| 1040 |
df_real_only = embeddings["real"].copy()
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
|
| 1046 |
reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
|
| 1047 |
|
| 1048 |
-
|
| 1049 |
# Agregar columnas PC1, PC2, … a df_real_only
|
| 1050 |
for i in range(reduced_real.shape[1]):
|
| 1051 |
df_real_only[f'PC{i+1}'] = reduced_real[:, i]
|
| 1052 |
|
| 1053 |
-
explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
|
| 1054 |
unique_labels_real = sorted(df_real_only['label'].unique().tolist())
|
| 1055 |
|
| 1056 |
# Mapeo de colores para las muestras reales usando la paleta Reds9
|
|
@@ -1062,13 +1088,15 @@ def run_model(model_name):
|
|
| 1062 |
real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
|
| 1063 |
|
| 1064 |
# Mostrar tabla de Explained Variance Ratio
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
"
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
|
|
|
|
|
|
| 1072 |
|
| 1073 |
# Mostrar los plots de loadings para cada componente
|
| 1074 |
# st.subheader("PCA - Real: Component Loadings")
|
|
|
|
| 52 |
|
| 53 |
return np.hstack(transformed)
|
| 54 |
|
| 55 |
+
|
| 56 |
N_COMPONENTS = 3
|
| 57 |
TSNE_NEIGHBOURS = 15
|
| 58 |
# WEIGHT_FACTOR = 0.05
|
| 59 |
|
| 60 |
+
|
| 61 |
TOOLTIPS = """
|
| 62 |
<div>
|
| 63 |
<div>
|
|
|
|
| 72 |
</div>
|
| 73 |
"""
|
| 74 |
|
| 75 |
+
|
| 76 |
def config_style():
|
| 77 |
# st.set_page_config(layout="wide")
|
| 78 |
st.markdown("""
|
|
|
|
| 88 |
""", unsafe_allow_html=True)
|
| 89 |
st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
|
| 90 |
|
| 91 |
+
|
| 92 |
def load_embeddings(model, version, embedding_prefix, weight_factor):
|
| 93 |
if model == "Donut":
|
| 94 |
df_real = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_all_{weight_factor}embeddings.csv")
|
|
|
|
| 279 |
st.error("Modelo no reconocido")
|
| 280 |
return None
|
| 281 |
|
| 282 |
+
|
| 283 |
def split_versions(df_combined, reduced):
|
| 284 |
# Asignar las coordenadas si la reducción es 2D
|
| 285 |
if reduced.shape[1] == 2:
|
|
|
|
| 299 |
unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
|
| 300 |
return df_dict, unique_subsets
|
| 301 |
|
| 302 |
+
|
| 303 |
def get_embedding_from_df(df):
|
| 304 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
| 305 |
if 'embedding' in df.columns:
|
|
|
|
| 309 |
else:
|
| 310 |
raise ValueError("No se encontró embedding o coordenadas x,y en el DataFrame.")
|
| 311 |
|
| 312 |
+
|
| 313 |
def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
|
| 314 |
if metric.lower() == "wasserstein":
|
| 315 |
n = synthetic_points.shape[0]
|
|
|
|
| 341 |
else:
|
| 342 |
raise ValueError("Métrica desconocida. Usa 'wasserstein', 'euclidean' o 'kl'.")
|
| 343 |
|
| 344 |
+
|
| 345 |
def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list, metric="wasserstein", bins=20) -> pd.DataFrame:
|
| 346 |
distances = {}
|
| 347 |
groups = synthetic_df.groupby(['source', 'label'])
|
|
|
|
| 363 |
distances[key][real_label] = d
|
| 364 |
return pd.DataFrame(distances).T
|
| 365 |
|
| 366 |
+
|
| 367 |
def compute_continuity(X, X_embedded, n_neighbors=5):
|
| 368 |
n = X.shape[0]
|
| 369 |
D_high = pairwise_distances(X, metric='euclidean')
|
|
|
|
| 384 |
continuity_value = 1 - norm * total
|
| 385 |
return continuity_value
|
| 386 |
|
| 387 |
+
|
| 388 |
def create_table(df_distances):
|
| 389 |
df_table = df_distances.copy()
|
| 390 |
df_table.reset_index(inplace=True)
|
|
|
|
| 448 |
fig.legend.visible = show_legend
|
| 449 |
return fig, real_renderers, synthetic_renderers, pretrained_renderers
|
| 450 |
|
| 451 |
+
|
| 452 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
| 453 |
renderers = {}
|
| 454 |
for label in selected_labels:
|
|
|
|
| 478 |
renderers[label + f" ({group_label})"] = r
|
| 479 |
return renderers
|
| 480 |
|
| 481 |
+
|
| 482 |
def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_label):
|
| 483 |
renderers = {}
|
| 484 |
for label in labels:
|
|
|
|
| 528 |
renderers[label + f" ({group_label})"] = r
|
| 529 |
return renderers
|
| 530 |
|
| 531 |
+
|
| 532 |
def get_color_maps(unique_subsets, result):
|
| 533 |
color_map = {}
|
| 534 |
num_real = len(unique_subsets["real"])
|
|
|
|
| 561 |
|
| 562 |
return color_map
|
| 563 |
|
| 564 |
+
|
| 565 |
def calculate_cluster_centers(df, labels):
|
| 566 |
centers = {}
|
| 567 |
for label in labels:
|
|
|
|
| 570 |
centers[label] = (subset['x'].mean(), subset['y'].mean())
|
| 571 |
return centers
|
| 572 |
|
| 573 |
+
|
| 574 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
|
| 575 |
if reduction_method == "PCA":
|
| 576 |
reducer = Pipeline([
|
| 577 |
("pca", PCA(n_components=N_COMPONENTS)),
|
| 578 |
("rel_scaler", RelativeScaler())
|
| 579 |
])
|
| 580 |
+
|
| 581 |
elif reduction_method == "UMAP":
|
| 582 |
reducer = umap.UMAP(n_components=N_COMPONENTS,
|
| 583 |
+
random_state=42,
|
| 584 |
+
n_neighbors=15,
|
| 585 |
+
min_dist=0.1,
|
| 586 |
+
metric='cosine'
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
else:
|
| 590 |
reducer = TSNE(n_components=2, random_state=42,
|
| 591 |
+
perplexity=tsne_params["perplexity"],
|
| 592 |
+
learning_rate=tsne_params["learning_rate"]
|
| 593 |
+
)
|
| 594 |
|
| 595 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
| 596 |
# reduced = reducer.fit_transform(df_combined[df_combined["version"] == "real"][embedding_cols].values)
|
|
|
|
| 893 |
tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
|
| 894 |
|
| 895 |
result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method, distance_metric=distance_metric.lower())
|
|
|
|
| 896 |
|
| 897 |
reg_metrics = pd.DataFrame({
|
| 898 |
"Slope": [result["slope"]],
|
|
|
|
| 1051 |
key=f"download_button_excel_{model_name}"
|
| 1052 |
)
|
| 1053 |
|
| 1054 |
+
if reduction_method in ("PCA", "UMAP"):
|
| 1055 |
st.markdown("## PCA - Solo Muestras Reales")
|
| 1056 |
# -------------------------------------------------------------------------
|
| 1057 |
# 1. PCA sobre las muestras reales
|
| 1058 |
df_real_only = embeddings["real"].copy()
|
| 1059 |
+
|
| 1060 |
+
if reduction_method == "PCA":
|
| 1061 |
+
reducer_real = Pipeline([
|
| 1062 |
+
("pca", PCA(n_components=N_COMPONENTS)),
|
| 1063 |
+
("rel_scaler", RelativeScaler())
|
| 1064 |
+
])
|
| 1065 |
+
|
| 1066 |
+
elif reduction_method == "UMAP":
|
| 1067 |
+
reducer_real = umap.UMAP(n_components=N_COMPONENTS,
|
| 1068 |
+
random_state=42,
|
| 1069 |
+
n_neighbors=15,
|
| 1070 |
+
min_dist=0.1,
|
| 1071 |
+
metric='cosine'
|
| 1072 |
+
)
|
| 1073 |
|
| 1074 |
reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
|
| 1075 |
|
|
|
|
| 1076 |
# Agregar columnas PC1, PC2, … a df_real_only
|
| 1077 |
for i in range(reduced_real.shape[1]):
|
| 1078 |
df_real_only[f'PC{i+1}'] = reduced_real[:, i]
|
| 1079 |
|
|
|
|
| 1080 |
unique_labels_real = sorted(df_real_only['label'].unique().tolist())
|
| 1081 |
|
| 1082 |
# Mapeo de colores para las muestras reales usando la paleta Reds9
|
|
|
|
| 1088 |
real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
|
| 1089 |
|
| 1090 |
# Mostrar tabla de Explained Variance Ratio
|
| 1091 |
+
if reduction_method == "PCA":
|
| 1092 |
+
explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
|
| 1093 |
+
st.subheader("PCA - Real: Explained Variance Ratio")
|
| 1094 |
+
component_names_real = [f"PC{i+1}" for i in range(len(explained_variance_real))]
|
| 1095 |
+
variance_df_real = pd.DataFrame({
|
| 1096 |
+
"Component": component_names_real,
|
| 1097 |
+
"Explained Variance": explained_variance_real
|
| 1098 |
+
})
|
| 1099 |
+
st.table(variance_df_real)
|
| 1100 |
|
| 1101 |
# Mostrar los plots de loadings para cada componente
|
| 1102 |
# st.subheader("PCA - Real: Component Loadings")
|