Spaces:
Running
Running
Commit
·
f5f7066
1
Parent(s):
d967697
Integrate Rescale in a Pipeline
Browse files
app.py
CHANGED
|
@@ -9,6 +9,7 @@ from sklearn.decomposition import PCA
|
|
| 9 |
from sklearn.manifold import TSNE, trustworthiness
|
| 10 |
from sklearn.metrics import pairwise_distances
|
| 11 |
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
| 12 |
import io
|
| 13 |
import ot
|
| 14 |
from sklearn.linear_model import LinearRegression
|
|
@@ -450,15 +451,16 @@ def calculate_cluster_centers(df, labels):
|
|
| 450 |
|
| 451 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
|
| 452 |
if reduction_method == "PCA":
|
| 453 |
-
reducer =
|
|
|
|
|
|
|
|
|
|
| 454 |
else:
|
| 455 |
reducer = TSNE(n_components=2, random_state=42,
|
| 456 |
perplexity=tsne_params["perplexity"],
|
| 457 |
learning_rate=tsne_params["learning_rate"])
|
| 458 |
|
| 459 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
| 460 |
-
scaler = MinMaxScaler(feature_range=(-1, 1))
|
| 461 |
-
reduced = scaler.fit_transform(reduced)
|
| 462 |
# Guardamos el embedding completo (por ejemplo, 4 dimensiones en PCA)
|
| 463 |
df_combined['embedding'] = list(reduced)
|
| 464 |
# Si el embedding es 2D, asignamos x e y para visualización
|
|
@@ -468,7 +470,7 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
|
|
| 468 |
|
| 469 |
explained_variance = None
|
| 470 |
if reduction_method == "PCA":
|
| 471 |
-
explained_variance = reducer.explained_variance_ratio_
|
| 472 |
|
| 473 |
trust = None
|
| 474 |
cont = None
|
|
@@ -791,17 +793,19 @@ def run_model(model_name):
|
|
| 791 |
# -------------------------------------------------------------------------
|
| 792 |
# 1. PCA sobre las muestras reales
|
| 793 |
df_real_only = embeddings["real"].copy()
|
| 794 |
-
|
| 795 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
|
| 797 |
-
scaler_real = MinMaxScaler(feature_range=(-1, 1))
|
| 798 |
-
reduced_real = scaler_real.fit_transform(reduced_real)
|
| 799 |
|
| 800 |
# Agregar columnas PC1, PC2, … a df_real_only
|
| 801 |
for i in range(reduced_real.shape[1]):
|
| 802 |
df_real_only[f'PC{i+1}'] = reduced_real[:, i]
|
| 803 |
|
| 804 |
-
explained_variance_real =
|
| 805 |
unique_labels_real = sorted(df_real_only['label'].unique().tolist())
|
| 806 |
|
| 807 |
# Mapeo de colores para las muestras reales usando la paleta Reds9
|
|
@@ -824,7 +828,7 @@ def run_model(model_name):
|
|
| 824 |
# Mostrar los plots de loadings para cada componente
|
| 825 |
st.subheader("PCA - Real: Component Loadings")
|
| 826 |
st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
|
| 827 |
-
for i, comp in enumerate(
|
| 828 |
source = ColumnDataSource(data=dict(
|
| 829 |
dimensions=embedding_cols,
|
| 830 |
weight=comp
|
|
@@ -855,7 +859,7 @@ def run_model(model_name):
|
|
| 855 |
df_all = {}
|
| 856 |
# Real
|
| 857 |
df_real_proj = embeddings["real"].copy()
|
| 858 |
-
proj_real =
|
| 859 |
for i in range(proj_real.shape[1]):
|
| 860 |
df_real_proj[f'PC{i+1}'] = proj_real[:, i]
|
| 861 |
df_all["real"] = df_real_proj
|
|
@@ -863,7 +867,7 @@ def run_model(model_name):
|
|
| 863 |
# Synthetic
|
| 864 |
if "synthetic" in embeddings:
|
| 865 |
df_synth_proj = embeddings["synthetic"].copy()
|
| 866 |
-
proj_synth =
|
| 867 |
for i in range(proj_synth.shape[1]):
|
| 868 |
df_synth_proj[f'PC{i+1}'] = proj_synth[:, i]
|
| 869 |
df_all["synthetic"] = df_synth_proj
|
|
@@ -871,7 +875,7 @@ def run_model(model_name):
|
|
| 871 |
# Pretrained
|
| 872 |
if "pretrained" in embeddings:
|
| 873 |
df_pretr_proj = embeddings["pretrained"].copy()
|
| 874 |
-
proj_pretr =
|
| 875 |
for i in range(proj_pretr.shape[1]):
|
| 876 |
df_pretr_proj[f'PC{i+1}'] = proj_pretr[:, i]
|
| 877 |
df_all["pretrained"] = df_pretr_proj
|
|
|
|
| 9 |
from sklearn.manifold import TSNE, trustworthiness
|
| 10 |
from sklearn.metrics import pairwise_distances
|
| 11 |
from sklearn.preprocessing import MinMaxScaler
|
| 12 |
+
from sklearn.pipeline import Pipeline
|
| 13 |
import io
|
| 14 |
import ot
|
| 15 |
from sklearn.linear_model import LinearRegression
|
|
|
|
| 451 |
|
| 452 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
|
| 453 |
if reduction_method == "PCA":
|
| 454 |
+
reducer = Pipeline([
|
| 455 |
+
("pca", PCA(n_components=N_COMPONENTS)),
|
| 456 |
+
("scaler", MinMaxScaler(feature_range=(-1, 1)))
|
| 457 |
+
])
|
| 458 |
else:
|
| 459 |
reducer = TSNE(n_components=2, random_state=42,
|
| 460 |
perplexity=tsne_params["perplexity"],
|
| 461 |
learning_rate=tsne_params["learning_rate"])
|
| 462 |
|
| 463 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
|
|
|
|
|
|
| 464 |
# Guardamos el embedding completo (por ejemplo, 4 dimensiones en PCA)
|
| 465 |
df_combined['embedding'] = list(reduced)
|
| 466 |
# Si el embedding es 2D, asignamos x e y para visualización
|
|
|
|
| 470 |
|
| 471 |
explained_variance = None
|
| 472 |
if reduction_method == "PCA":
|
| 473 |
+
explained_variance = reducer.named_steps["pca"].explained_variance_ratio_
|
| 474 |
|
| 475 |
trust = None
|
| 476 |
cont = None
|
|
|
|
| 793 |
# -------------------------------------------------------------------------
|
| 794 |
# 1. PCA sobre las muestras reales
|
| 795 |
df_real_only = embeddings["real"].copy()
|
| 796 |
+
reducer_real = Pipeline([
|
| 797 |
+
("pca", PCA(n_components=N_COMPONENTS)),
|
| 798 |
+
("scaler", MinMaxScaler(feature_range=(-1, 1)))
|
| 799 |
+
])
|
| 800 |
+
|
| 801 |
+
reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
|
| 802 |
|
|
|
|
|
|
|
| 803 |
|
| 804 |
# Agregar columnas PC1, PC2, … a df_real_only
|
| 805 |
for i in range(reduced_real.shape[1]):
|
| 806 |
df_real_only[f'PC{i+1}'] = reduced_real[:, i]
|
| 807 |
|
| 808 |
+
explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
|
| 809 |
unique_labels_real = sorted(df_real_only['label'].unique().tolist())
|
| 810 |
|
| 811 |
# Mapeo de colores para las muestras reales usando la paleta Reds9
|
|
|
|
| 828 |
# Mostrar los plots de loadings para cada componente
|
| 829 |
st.subheader("PCA - Real: Component Loadings")
|
| 830 |
st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
|
| 831 |
+
for i, comp in enumerate(reducer_real.named_steps["pca"].components_):
|
| 832 |
source = ColumnDataSource(data=dict(
|
| 833 |
dimensions=embedding_cols,
|
| 834 |
weight=comp
|
|
|
|
| 859 |
df_all = {}
|
| 860 |
# Real
|
| 861 |
df_real_proj = embeddings["real"].copy()
|
| 862 |
+
proj_real = reducer_real.named_steps["pca"].transform(df_real_proj[embedding_cols].values)
|
| 863 |
for i in range(proj_real.shape[1]):
|
| 864 |
df_real_proj[f'PC{i+1}'] = proj_real[:, i]
|
| 865 |
df_all["real"] = df_real_proj
|
|
|
|
| 867 |
# Synthetic
|
| 868 |
if "synthetic" in embeddings:
|
| 869 |
df_synth_proj = embeddings["synthetic"].copy()
|
| 870 |
+
proj_synth = reducer_real.named_steps["pca"].transform(df_synth_proj[embedding_cols].values)
|
| 871 |
for i in range(proj_synth.shape[1]):
|
| 872 |
df_synth_proj[f'PC{i+1}'] = proj_synth[:, i]
|
| 873 |
df_all["synthetic"] = df_synth_proj
|
|
|
|
| 875 |
# Pretrained
|
| 876 |
if "pretrained" in embeddings:
|
| 877 |
df_pretr_proj = embeddings["pretrained"].copy()
|
| 878 |
+
proj_pretr = reducer_real.named_steps["pca"].transform(df_pretr_proj[embedding_cols].values)
|
| 879 |
for i in range(proj_pretr.shape[1]):
|
| 880 |
df_pretr_proj[f'PC{i+1}'] = proj_pretr[:, i]
|
| 881 |
df_all["pretrained"] = df_pretr_proj
|