de-Rodrigo committed on
Commit
7b849c4
·
1 Parent(s): 1db0fcf

Try Visualization

Browse files
Files changed (1) hide show
  1. app.py +49 -21
app.py CHANGED
@@ -52,10 +52,12 @@ class RelativeScaler(BaseEstimator, TransformerMixin):
52
 
53
  return np.hstack(transformed)
54
 
 
55
  N_COMPONENTS = 3
56
  TSNE_NEIGHBOURS = 15
57
  # WEIGHT_FACTOR = 0.05
58
 
 
59
  TOOLTIPS = """
60
  <div>
61
  <div>
@@ -70,6 +72,7 @@ TOOLTIPS = """
70
  </div>
71
  """
72
 
 
73
  def config_style():
74
  # st.set_page_config(layout="wide")
75
  st.markdown("""
@@ -85,6 +88,7 @@ def config_style():
85
  """, unsafe_allow_html=True)
86
  st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
87
 
 
88
  def load_embeddings(model, version, embedding_prefix, weight_factor):
89
  if model == "Donut":
90
  df_real = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_all_{weight_factor}embeddings.csv")
@@ -275,6 +279,7 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
275
  st.error("Modelo no reconocido")
276
  return None
277
 
 
278
  def split_versions(df_combined, reduced):
279
  # Asignar las coordenadas si la reducción es 2D
280
  if reduced.shape[1] == 2:
@@ -294,6 +299,7 @@ def split_versions(df_combined, reduced):
294
  unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
295
  return df_dict, unique_subsets
296
 
 
297
  def get_embedding_from_df(df):
298
  # Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
299
  if 'embedding' in df.columns:
@@ -303,6 +309,7 @@ def get_embedding_from_df(df):
303
  else:
304
  raise ValueError("No se encontró embedding o coordenadas x,y en el DataFrame.")
305
 
 
306
  def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
307
  if metric.lower() == "wasserstein":
308
  n = synthetic_points.shape[0]
@@ -334,6 +341,7 @@ def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein"
334
  else:
335
  raise ValueError("Métrica desconocida. Usa 'wasserstein', 'euclidean' o 'kl'.")
336
 
 
337
  def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list, metric="wasserstein", bins=20) -> pd.DataFrame:
338
  distances = {}
339
  groups = synthetic_df.groupby(['source', 'label'])
@@ -355,6 +363,7 @@ def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, d
355
  distances[key][real_label] = d
356
  return pd.DataFrame(distances).T
357
 
 
358
  def compute_continuity(X, X_embedded, n_neighbors=5):
359
  n = X.shape[0]
360
  D_high = pairwise_distances(X, metric='euclidean')
@@ -375,6 +384,7 @@ def compute_continuity(X, X_embedded, n_neighbors=5):
375
  continuity_value = 1 - norm * total
376
  return continuity_value
377
 
 
378
  def create_table(df_distances):
379
  df_table = df_distances.copy()
380
  df_table.reset_index(inplace=True)
@@ -438,6 +448,7 @@ def create_figure(dfs, unique_subsets, color_maps, model_name):
438
  fig.legend.visible = show_legend
439
  return fig, real_renderers, synthetic_renderers, pretrained_renderers
440
 
 
441
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
442
  renderers = {}
443
  for label in selected_labels:
@@ -467,6 +478,7 @@ def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_la
467
  renderers[label + f" ({group_label})"] = r
468
  return renderers
469
 
 
470
  def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_label):
471
  renderers = {}
472
  for label in labels:
@@ -516,6 +528,7 @@ def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_l
516
  renderers[label + f" ({group_label})"] = r
517
  return renderers
518
 
 
519
  def get_color_maps(unique_subsets, result):
520
  color_map = {}
521
  num_real = len(unique_subsets["real"])
@@ -548,6 +561,7 @@ def get_color_maps(unique_subsets, result):
548
 
549
  return color_map
550
 
 
551
  def calculate_cluster_centers(df, labels):
552
  centers = {}
553
  for label in labels:
@@ -556,22 +570,27 @@ def calculate_cluster_centers(df, labels):
556
  centers[label] = (subset['x'].mean(), subset['y'].mean())
557
  return centers
558
 
 
559
  def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
560
  if reduction_method == "PCA":
561
  reducer = Pipeline([
562
  ("pca", PCA(n_components=N_COMPONENTS)),
563
  ("rel_scaler", RelativeScaler())
564
  ])
 
565
  elif reduction_method == "UMAP":
566
  reducer = umap.UMAP(n_components=N_COMPONENTS,
567
- random_state=42,
568
- n_neighbors=15,
569
- min_dist=0.1,
570
- metric='cosine')
 
 
571
  else:
572
  reducer = TSNE(n_components=2, random_state=42,
573
- perplexity=tsne_params["perplexity"],
574
- learning_rate=tsne_params["learning_rate"])
 
575
 
576
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
577
  # reduced = reducer.fit_transform(df_combined[df_combined["version"] == "real"][embedding_cols].values)
@@ -874,7 +893,6 @@ def run_model(model_name):
874
  tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
875
 
876
  result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method, distance_metric=distance_metric.lower())
877
- print(result)
878
 
879
  reg_metrics = pd.DataFrame({
880
  "Slope": [result["slope"]],
@@ -1033,24 +1051,32 @@ def run_model(model_name):
1033
  key=f"download_button_excel_{model_name}"
1034
  )
1035
 
1036
- if reduction_method == "PCA":
1037
  st.markdown("## PCA - Solo Muestras Reales")
1038
  # -------------------------------------------------------------------------
1039
  # 1. PCA sobre las muestras reales
1040
  df_real_only = embeddings["real"].copy()
1041
- reducer_real = Pipeline([
1042
- ("pca", PCA(n_components=N_COMPONENTS)),
1043
- ("rel_scaler", RelativeScaler())
1044
- ])
 
 
 
 
 
 
 
 
 
 
1045
 
1046
  reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
1047
 
1048
-
1049
  # Agregar columnas PC1, PC2, … a df_real_only
1050
  for i in range(reduced_real.shape[1]):
1051
  df_real_only[f'PC{i+1}'] = reduced_real[:, i]
1052
 
1053
- explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
1054
  unique_labels_real = sorted(df_real_only['label'].unique().tolist())
1055
 
1056
  # Mapeo de colores para las muestras reales usando la paleta Reds9
@@ -1062,13 +1088,15 @@ def run_model(model_name):
1062
  real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
1063
 
1064
  # Mostrar tabla de Explained Variance Ratio
1065
- st.subheader("PCA - Real: Explained Variance Ratio")
1066
- component_names_real = [f"PC{i+1}" for i in range(len(explained_variance_real))]
1067
- variance_df_real = pd.DataFrame({
1068
- "Component": component_names_real,
1069
- "Explained Variance": explained_variance_real
1070
- })
1071
- st.table(variance_df_real)
 
 
1072
 
1073
  # Mostrar los plots de loadings para cada componente
1074
  # st.subheader("PCA - Real: Component Loadings")
 
52
 
53
  return np.hstack(transformed)
54
 
55
+
56
  N_COMPONENTS = 3
57
  TSNE_NEIGHBOURS = 15
58
  # WEIGHT_FACTOR = 0.05
59
 
60
+
61
  TOOLTIPS = """
62
  <div>
63
  <div>
 
72
  </div>
73
  """
74
 
75
+
76
  def config_style():
77
  # st.set_page_config(layout="wide")
78
  st.markdown("""
 
88
  """, unsafe_allow_html=True)
89
  st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
90
 
91
+
92
  def load_embeddings(model, version, embedding_prefix, weight_factor):
93
  if model == "Donut":
94
  df_real = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_all_{weight_factor}embeddings.csv")
 
279
  st.error("Modelo no reconocido")
280
  return None
281
 
282
+
283
  def split_versions(df_combined, reduced):
284
  # Asignar las coordenadas si la reducción es 2D
285
  if reduced.shape[1] == 2:
 
299
  unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
300
  return df_dict, unique_subsets
301
 
302
+
303
  def get_embedding_from_df(df):
304
  # Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
305
  if 'embedding' in df.columns:
 
309
  else:
310
  raise ValueError("No se encontró embedding o coordenadas x,y en el DataFrame.")
311
 
312
+
313
  def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
314
  if metric.lower() == "wasserstein":
315
  n = synthetic_points.shape[0]
 
341
  else:
342
  raise ValueError("Métrica desconocida. Usa 'wasserstein', 'euclidean' o 'kl'.")
343
 
344
+
345
  def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list, metric="wasserstein", bins=20) -> pd.DataFrame:
346
  distances = {}
347
  groups = synthetic_df.groupby(['source', 'label'])
 
363
  distances[key][real_label] = d
364
  return pd.DataFrame(distances).T
365
 
366
+
367
  def compute_continuity(X, X_embedded, n_neighbors=5):
368
  n = X.shape[0]
369
  D_high = pairwise_distances(X, metric='euclidean')
 
384
  continuity_value = 1 - norm * total
385
  return continuity_value
386
 
387
+
388
  def create_table(df_distances):
389
  df_table = df_distances.copy()
390
  df_table.reset_index(inplace=True)
 
448
  fig.legend.visible = show_legend
449
  return fig, real_renderers, synthetic_renderers, pretrained_renderers
450
 
451
+
452
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
453
  renderers = {}
454
  for label in selected_labels:
 
478
  renderers[label + f" ({group_label})"] = r
479
  return renderers
480
 
481
+
482
  def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_label):
483
  renderers = {}
484
  for label in labels:
 
528
  renderers[label + f" ({group_label})"] = r
529
  return renderers
530
 
531
+
532
  def get_color_maps(unique_subsets, result):
533
  color_map = {}
534
  num_real = len(unique_subsets["real"])
 
561
 
562
  return color_map
563
 
564
+
565
  def calculate_cluster_centers(df, labels):
566
  centers = {}
567
  for label in labels:
 
570
  centers[label] = (subset['x'].mean(), subset['y'].mean())
571
  return centers
572
 
573
+
574
  def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
575
  if reduction_method == "PCA":
576
  reducer = Pipeline([
577
  ("pca", PCA(n_components=N_COMPONENTS)),
578
  ("rel_scaler", RelativeScaler())
579
  ])
580
+
581
  elif reduction_method == "UMAP":
582
  reducer = umap.UMAP(n_components=N_COMPONENTS,
583
+ random_state=42,
584
+ n_neighbors=15,
585
+ min_dist=0.1,
586
+ metric='cosine'
587
+ )
588
+
589
  else:
590
  reducer = TSNE(n_components=2, random_state=42,
591
+ perplexity=tsne_params["perplexity"],
592
+ learning_rate=tsne_params["learning_rate"]
593
+ )
594
 
595
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
596
  # reduced = reducer.fit_transform(df_combined[df_combined["version"] == "real"][embedding_cols].values)
 
893
  tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
894
 
895
  result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method, distance_metric=distance_metric.lower())
 
896
 
897
  reg_metrics = pd.DataFrame({
898
  "Slope": [result["slope"]],
 
1051
  key=f"download_button_excel_{model_name}"
1052
  )
1053
 
1054
+ if reduction_method in ("PCA", "UMAP"):
1055
  st.markdown("## PCA - Solo Muestras Reales")
1056
  # -------------------------------------------------------------------------
1057
  # 1. PCA sobre las muestras reales
1058
  df_real_only = embeddings["real"].copy()
1059
+
1060
+ if reduction_method == "PCA":
1061
+ reducer_real = Pipeline([
1062
+ ("pca", PCA(n_components=N_COMPONENTS)),
1063
+ ("rel_scaler", RelativeScaler())
1064
+ ])
1065
+
1066
+ elif reduction_method == "UMAP":
1067
+ reducer_real = umap.UMAP(n_components=N_COMPONENTS,
1068
+ random_state=42,
1069
+ n_neighbors=15,
1070
+ min_dist=0.1,
1071
+ metric='cosine'
1072
+ )
1073
 
1074
  reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
1075
 
 
1076
  # Agregar columnas PC1, PC2, … a df_real_only
1077
  for i in range(reduced_real.shape[1]):
1078
  df_real_only[f'PC{i+1}'] = reduced_real[:, i]
1079
 
 
1080
  unique_labels_real = sorted(df_real_only['label'].unique().tolist())
1081
 
1082
  # Mapeo de colores para las muestras reales usando la paleta Reds9
 
1088
  real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
1089
 
1090
  # Mostrar tabla de Explained Variance Ratio
1091
+ if reduction_method == "PCA":
1092
+ explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
1093
+ st.subheader("PCA - Real: Explained Variance Ratio")
1094
+ component_names_real = [f"PC{i+1}" for i in range(len(explained_variance_real))]
1095
+ variance_df_real = pd.DataFrame({
1096
+ "Component": component_names_real,
1097
+ "Explained Variance": explained_variance_real
1098
+ })
1099
+ st.table(variance_df_real)
1100
 
1101
  # Mostrar los plots de loadings para cada componente
1102
  # st.subheader("PCA - Real: Component Loadings")