Spaces:

de-Rodrigo
/

Embeddings

Sleeping

App Files Files Community

de-Rodrigo committed on Mar 13

Commit

71eb50f

1 Parent(s): b79fb2d

Add Dimensions

Browse files

Files changed (1) hide show

app.py +86 -90

app.py CHANGED Viewed

@@ -12,6 +12,10 @@ import io
 import ot
 from sklearn.linear_model import LinearRegression
 TOOLTIPS = """
 <div>
     <div>
@@ -37,10 +41,6 @@ def config_style():
     """, unsafe_allow_html=True)
     st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
-# =============================================================================
-# Funciones de carga de datos y procesamiento (sin cambios en su mayoría)
-# =============================================================================
 def load_embeddings(model, version):
     if model == "Donut":
         df_real = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_secret_all_embeddings.csv")
@@ -95,8 +95,10 @@ def load_embeddings(model, version):
         return None
 def split_versions(df_combined, reduced):
-    df_combined['x'] = reduced[:, 0]
-    df_combined['y'] = reduced[:, 1]
     df_real = df_combined[df_combined["version"] == "real"].copy()
     df_synth = df_combined[df_combined["version"] == "synthetic"].copy()
     unique_real = sorted(df_real['label'].unique().tolist())
@@ -107,10 +109,14 @@ def split_versions(df_combined, reduced):
     unique_subsets = {"real": unique_real, "synthetic": unique_synth}
     return df_dict, unique_subsets
-# =============================================================================
-# Funciones para calcular distancias entre clusters según la métrica seleccionada
-# (Wasserstein, Euclidean o KL)
-# =============================================================================
 def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
     if metric.lower() == "wasserstein":
@@ -125,13 +131,14 @@ def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein"
         center_real = np.mean(real_points, axis=0)
         return np.linalg.norm(center_syn - center_real)
     elif metric.lower() == "kl":
         all_points = np.vstack([synthetic_points, real_points])
-        x_min, y_min = np.min(all_points, axis=0)
-        x_max, y_max = np.max(all_points, axis=0)
-        x_bins = np.linspace(x_min, x_max, bins+1)
-        y_bins = np.linspace(y_min, y_max, bins+1)
-        H_syn, _, _ = np.histogram2d(synthetic_points[:,0], synthetic_points[:,1], bins=[x_bins, y_bins])
-        H_real, _, _ = np.histogram2d(real_points[:,0], real_points[:,1], bins=[x_bins, y_bins])
         eps = 1e-10
         P = H_syn + eps
         Q = H_real + eps
@@ -147,26 +154,22 @@ def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, d
     groups = synthetic_df.groupby(['source', 'label'])
     for (source, label), group in groups:
         key = f"{label} ({source})"
-        data = group[['x', 'y']].values
         distances[key] = {}
         for real_label in real_labels:
-            real_data = df_real[df_real['label'] == real_label][['x','y']].values
             d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
             distances[key][real_label] = d
     for source, group in synthetic_df.groupby('source'):
         key = f"Global ({source})"
-        data = group[['x','y']].values
         distances[key] = {}
         for real_label in real_labels:
-            real_data = df_real[df_real['label'] == real_label][['x','y']].values
             d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
             distances[key][real_label] = d
     return pd.DataFrame(distances).T
-# =============================================================================
-# Función para calcular continuidad (mide la preservación de la vecindad original en el embedding)
-# =============================================================================
 def compute_continuity(X, X_embedded, n_neighbors=5):
     n = X.shape[0]
     D_high = pairwise_distances(X, metric='euclidean')
@@ -187,10 +190,6 @@ def compute_continuity(X, X_embedded, n_neighbors=5):
     continuity_value = 1 - norm * total
     return continuity_value
-# =============================================================================
-# Funciones de visualización (sin cambios)
-# =============================================================================
 def create_table(df_distances):
     df_table = df_distances.copy()
     df_table.reset_index(inplace=True)
@@ -214,6 +213,7 @@ def create_table(df_distances):
     return data_table, df_table, source_table
 def create_figure(dfs, unique_subsets, color_maps, model_name):
     fig = figure(width=600, height=600, tools="wheel_zoom,pan,reset,save", active_scroll="wheel_zoom", tooltips=TOOLTIPS, title="")
     real_renderers = add_dataset_to_fig(fig, dfs["real"], unique_subsets["real"],
                                         marker="circle", color_mapping=color_maps["real"],
@@ -350,38 +350,36 @@ def calculate_cluster_centers(df, labels):
     centers = {}
     for label in labels:
         subset = df[df['label'] == label]
-        if not subset.empty:
             centers[label] = (subset['x'].mean(), subset['y'].mean())
     return centers
-# =============================================================================
-# Pipeline central: reducción, cálculo de distancias y regresión global.
-# Se agrega el parámetro distance_metric.
-# Además, si se utiliza t-SNE, se calculan trustworthiness y continuity.
-# =============================================================================
 def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
     if reduction_method == "PCA":
-        reducer = PCA(n_components=2)
     else:
-        reducer = TSNE(n_components=2, random_state=42,
                          perplexity=tsne_params["perplexity"],
                          learning_rate=tsne_params["learning_rate"])
     reduced = reducer.fit_transform(df_combined[embedding_cols].values)
-    # Para PCA se captura la explained variance ratio
     explained_variance = None
     if reduction_method == "PCA":
         explained_variance = reducer.explained_variance_ratio_
-    # Si se usa t-SNE, calculamos trustworthiness y continuity
     trust = None
     cont = None
     if reduction_method == "t-SNE":
         X = df_combined[embedding_cols].values
-        trust = trustworthiness(X, reduced, n_neighbors=5)
-        cont = compute_continuity(X, reduced, n_neighbors=5)
     dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
@@ -453,15 +451,11 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
         "dfs_reduced": dfs_reduced,
         "unique_subsets": unique_subsets,
         "df_distances": df_distances,
-        "explained_variance": explained_variance,  # Solo para PCA
-        "trustworthiness": trust,                  # Solo para t-SNE
-        "continuity": cont                         # Solo para t-SNE
     }
-# =============================================================================
-# Optimización de parámetros para TSNE (se propaga también la métrica de distancia)
-# =============================================================================
 def optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric):
     perplexity_range = np.linspace(30, 50, 10)
     learning_rate_range = np.linspace(200, 1000, 20)
@@ -490,11 +484,6 @@ def optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric):
     progress_text.text("Optimization completed!")
     return best_params, best_R2
-# =============================================================================
-# Función principal run_model: incluye selector de versión, método de reducción, métrica de distancia,
-# y, si se usa t-SNE, muestra trustworthiness y continuity.
-# =============================================================================
 def run_model(model_name):
     version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
@@ -556,8 +545,9 @@ def run_model(model_name):
     if reduction_method == "PCA" and result["explained_variance"] is not None:
         st.subheader("Explained Variance Ratio")
         variance_df = pd.DataFrame({
-            "Component": ["PC1", "PC2"],
             "Explained Variance": result["explained_variance"]
         })
         st.table(variance_df)
@@ -565,6 +555,7 @@ def run_model(model_name):
         st.subheader("t-SNE Quality Metrics")
         st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
         st.write(f"Continuity: {result['continuity']:.4f}")
     data_table, df_table, source_table = create_table(result["df_distances"])
     real_subset_names = list(df_table.columns[1:])
@@ -572,53 +563,58 @@ def run_model(model_name):
     reset_button = Button(label="Reset Colors", button_type="primary")
     line_source = ColumnDataSource(data={'x': [], 'y': []})
-    fig, real_renderers, synthetic_renderers = create_figure(result["dfs_reduced"], result["unique_subsets"], get_color_maps(result["unique_subsets"]), model_name)
-    fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
-    centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
-    real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}
-    synthetic_centers = {}
-    synth_labels = sorted(result["dfs_reduced"]["synthetic"]['label'].unique().tolist())
-    for label in synth_labels:
-        subset = result["dfs_reduced"]["synthetic"][result["dfs_reduced"]["synthetic"]['label'] == label]
-        synthetic_centers[label] = [subset['x'].mean(), subset['y'].mean()]
-    callback = CustomJS(args=dict(source=source_table, line_source=line_source,
                                   synthetic_centers=synthetic_centers,
                                   real_centers=real_centers_js,
                                   real_select=real_select),
-    code="""
-        var selected = source.selected.indices;
-        if (selected.length > 0) {
-            var idx = selected[0];
-            var data = source.data;
-            var synth_label = data['Synthetic'][idx];
-            var real_label = real_select.value;
-            var syn_coords = synthetic_centers[synth_label];
-            var real_coords = real_centers[real_label];
-            line_source.data = {'x': [syn_coords[0], real_coords[0]], 'y': [syn_coords[1], real_coords[1]]};
-            line_source.change.emit();
-        } else {
             line_source.data = {'x': [], 'y': []};
             line_source.change.emit();
-        }
-    """)
-    source_table.selected.js_on_change('indices', callback)
-    real_select.js_on_change('value', callback)
-    reset_callback = CustomJS(args=dict(line_source=line_source),
-    code="""
-        line_source.data = {'x': [], 'y': []};
-        line_source.change.emit();
-    """)
-    reset_button.js_on_event("button_click", reset_callback)
     buffer = io.BytesIO()
     df_table.to_excel(buffer, index=False)
     buffer.seek(0)
-    layout = column(fig, result["scatter_fig"], column(real_select, reset_button, data_table))
-    st.bokeh_chart(layout, use_container_width=True)
     st.download_button(
         label="Export Table",
         data=buffer,

 import ot
 from sklearn.linear_model import LinearRegression
+# Number of components kept by PCA for the embedding (t-SNE uses its own hard-coded n_components=3)
+N_COMPONENTS = 100
+TSNE_NEIGHBOURS = 150
 TOOLTIPS = """
 <div>
     <div>
     """, unsafe_allow_html=True)
     st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
 def load_embeddings(model, version):
     if model == "Donut":
         df_real = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_secret_all_embeddings.csv")
         return None
 def split_versions(df_combined, reduced):
+    # Si el embedding es 2D se asignan las columnas x e y para visualización.
+    if reduced.shape[1] == 2:
+        df_combined['x'] = reduced[:, 0]
+        df_combined['y'] = reduced[:, 1]
     df_real = df_combined[df_combined["version"] == "real"].copy()
     df_synth = df_combined[df_combined["version"] == "synthetic"].copy()
     unique_real = sorted(df_real['label'].unique().tolist())
     unique_subsets = {"real": unique_real, "synthetic": unique_synth}
     return df_dict, unique_subsets
+def get_embedding_from_df(df):
+    # Returns the full reduced embedding stored in the 'embedding' column (one vector per row); falls back to the 2-D x/y columns when that column is absent
+    if 'embedding' in df.columns:
+        return np.stack(df['embedding'].to_numpy())
+    elif 'x' in df.columns and 'y' in df.columns:
+        return df[['x', 'y']].values
+    else:
+        raise ValueError("No se encontró embedding o coordenadas x,y en el DataFrame.")
 def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
     if metric.lower() == "wasserstein":
         center_real = np.mean(real_points, axis=0)
         return np.linalg.norm(center_syn - center_real)
     elif metric.lower() == "kl":
+        # Para KL usamos histogramas multidimensionales con límites globales en cada dimensión
         all_points = np.vstack([synthetic_points, real_points])
+        edges = [
+            np.linspace(np.min(all_points[:, i]), np.max(all_points[:, i]), bins+1)
+            for i in range(all_points.shape[1])
+        ]
+        H_syn, _ = np.histogramdd(synthetic_points, bins=edges)
+        H_real, _ = np.histogramdd(real_points, bins=edges)
         eps = 1e-10
         P = H_syn + eps
         Q = H_real + eps
     groups = synthetic_df.groupby(['source', 'label'])
     for (source, label), group in groups:
         key = f"{label} ({source})"
+        data = get_embedding_from_df(group)
         distances[key] = {}
         for real_label in real_labels:
+            real_data = get_embedding_from_df(df_real[df_real['label'] == real_label])
             d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
             distances[key][real_label] = d
     for source, group in synthetic_df.groupby('source'):
         key = f"Global ({source})"
+        data = get_embedding_from_df(group)
         distances[key] = {}
         for real_label in real_labels:
+            real_data = get_embedding_from_df(df_real[df_real['label'] == real_label])
             d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
             distances[key][real_label] = d
     return pd.DataFrame(distances).T
 def compute_continuity(X, X_embedded, n_neighbors=5):
     n = X.shape[0]
     D_high = pairwise_distances(X, metric='euclidean')
     continuity_value = 1 - norm * total
     return continuity_value
 def create_table(df_distances):
     df_table = df_distances.copy()
     df_table.reset_index(inplace=True)
     return data_table, df_table, source_table
 def create_figure(dfs, unique_subsets, color_maps, model_name):
+    # Se crea solo si el embedding es 2D (ya que se usan 'x' y 'y' para visualizar)
     fig = figure(width=600, height=600, tools="wheel_zoom,pan,reset,save", active_scroll="wheel_zoom", tooltips=TOOLTIPS, title="")
     real_renderers = add_dataset_to_fig(fig, dfs["real"], unique_subsets["real"],
                                         marker="circle", color_mapping=color_maps["real"],
     centers = {}
     for label in labels:
         subset = df[df['label'] == label]
+        if not subset.empty and 'x' in subset.columns and 'y' in subset.columns:
             centers[label] = (subset['x'].mean(), subset['y'].mean())
     return centers
 def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
     if reduction_method == "PCA":
+        reducer = PCA(n_components=N_COMPONENTS)
     else:
+        reducer = TSNE(n_components=3, random_state=42,
                          perplexity=tsne_params["perplexity"],
                          learning_rate=tsne_params["learning_rate"])
     reduced = reducer.fit_transform(df_combined[embedding_cols].values)
+    # Store the full reduced embedding (every component returned by the reducer), one vector per row
+    df_combined['embedding'] = list(reduced)
+    # If the reduced embedding happens to be 2-D (reducer configured with 2 components), also assign x and y for visualisation
+    if reduced.shape[1] == 2:
+        df_combined['x'] = reduced[:, 0]
+        df_combined['y'] = reduced[:, 1]
     explained_variance = None
     if reduction_method == "PCA":
         explained_variance = reducer.explained_variance_ratio_
     trust = None
     cont = None
     if reduction_method == "t-SNE":
         X = df_combined[embedding_cols].values
+        trust = trustworthiness(X, reduced, n_neighbors=TSNE_NEIGHBOURS)
+        cont = compute_continuity(X, reduced, n_neighbors=TSNE_NEIGHBOURS)
     dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
         "dfs_reduced": dfs_reduced,
         "unique_subsets": unique_subsets,
         "df_distances": df_distances,
+        "explained_variance": explained_variance,
+        "trustworthiness": trust,
+        "continuity": cont
     }
 def optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric):
     perplexity_range = np.linspace(30, 50, 10)
     learning_rate_range = np.linspace(200, 1000, 20)
     progress_text.text("Optimization completed!")
     return best_params, best_R2
 def run_model(model_name):
     version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
     if reduction_method == "PCA" and result["explained_variance"] is not None:
         st.subheader("Explained Variance Ratio")
+        component_names = [f"PC{i+1}" for i in range(len(result["explained_variance"]))]
         variance_df = pd.DataFrame({
+            "Component": component_names,
             "Explained Variance": result["explained_variance"]
         })
         st.table(variance_df)
         st.subheader("t-SNE Quality Metrics")
         st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
         st.write(f"Continuity: {result['continuity']:.4f}")
     data_table, df_table, source_table = create_table(result["df_distances"])
     real_subset_names = list(df_table.columns[1:])
     reset_button = Button(label="Reset Colors", button_type="primary")
     line_source = ColumnDataSource(data={'x': [], 'y': []})
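+    # The 2-D embedding scatter plot is only built when the embedding has two components;
+    # with the current settings (N_COMPONENTS = 100 for PCA, n_components=3 for t-SNE) this block is skipped.
+    # NOTE(review): the check below tests N_COMPONENTS for t-SNE too, although TSNE is created with a hard-coded n_components=3.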
+    if (reduction_method == "t-SNE" and N_COMPONENTS == 2) or (reduction_method == "PCA" and N_COMPONENTS == 2):
+        fig, real_renderers, synthetic_renderers = create_figure(result["dfs_reduced"], result["unique_subsets"], get_color_maps(result["unique_subsets"]), model_name)
+        fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
+        centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
+        real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}
+        synthetic_centers = {}
+        synth_labels = sorted(result["dfs_reduced"]["synthetic"]['label'].unique().tolist())
+        for label in synth_labels:
+            subset = result["dfs_reduced"]["synthetic"][result["dfs_reduced"]["synthetic"]['label'] == label]
+            if 'x' in subset.columns and 'y' in subset.columns:
+                synthetic_centers[label] = [subset['x'].mean(), subset['y'].mean()]
+        callback = CustomJS(args=dict(source=source_table, line_source=line_source,
                                   synthetic_centers=synthetic_centers,
                                   real_centers=real_centers_js,
                                   real_select=real_select),
+        code="""
+            var selected = source.selected.indices;
+            if (selected.length > 0) {
+                var idx = selected[0];
+                var data = source.data;
+                var synth_label = data['Synthetic'][idx];
+                var real_label = real_select.value;
+                var syn_coords = synthetic_centers[synth_label];
+                var real_coords = real_centers[real_label];
+                line_source.data = {'x': [syn_coords[0], real_coords[0]], 'y': [syn_coords[1], real_coords[1]]};
+                line_source.change.emit();
+            } else {
+                line_source.data = {'x': [], 'y': []};
+                line_source.change.emit();
+            }
+        """)
+        source_table.selected.js_on_change('indices', callback)
+        real_select.js_on_change('value', callback)
+        reset_callback = CustomJS(args=dict(line_source=line_source),
+        code="""
             line_source.data = {'x': [], 'y': []};
             line_source.change.emit();
+        """)
+        reset_button.js_on_event("button_click", reset_callback)
+        layout = column(fig, result["scatter_fig"], column(real_select, reset_button, data_table))
+    else:
+        layout = column(result["scatter_fig"], column(real_select, reset_button, data_table))
+    st.bokeh_chart(layout, use_container_width=True)
     buffer = io.BytesIO()
     df_table.to_excel(buffer, index=False)
     buffer.seek(0)
     st.download_button(
         label="Export Table",
         data=buffer,