de-Rodrigo committed on
Commit
f5f7066
·
1 Parent(s): d967697

Integrate Rescale in a Pipeline

Browse files
Files changed (1) hide show
  1. app.py +17 -13
app.py CHANGED
@@ -9,6 +9,7 @@ from sklearn.decomposition import PCA
9
  from sklearn.manifold import TSNE, trustworthiness
10
  from sklearn.metrics import pairwise_distances
11
  from sklearn.preprocessing import MinMaxScaler
 
12
  import io
13
  import ot
14
  from sklearn.linear_model import LinearRegression
@@ -450,15 +451,16 @@ def calculate_cluster_centers(df, labels):
450
 
451
  def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
452
  if reduction_method == "PCA":
453
- reducer = PCA(n_components=N_COMPONENTS)
 
 
 
454
  else:
455
  reducer = TSNE(n_components=2, random_state=42,
456
  perplexity=tsne_params["perplexity"],
457
  learning_rate=tsne_params["learning_rate"])
458
 
459
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
460
- scaler = MinMaxScaler(feature_range=(-1, 1))
461
- reduced = scaler.fit_transform(reduced)
462
  # Guardamos el embedding completo (por ejemplo, 4 dimensiones en PCA)
463
  df_combined['embedding'] = list(reduced)
464
  # Si el embedding es 2D, asignamos x e y para visualización
@@ -468,7 +470,7 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
468
 
469
  explained_variance = None
470
  if reduction_method == "PCA":
471
- explained_variance = reducer.explained_variance_ratio_
472
 
473
  trust = None
474
  cont = None
@@ -791,17 +793,19 @@ def run_model(model_name):
791
  # -------------------------------------------------------------------------
792
  # 1. PCA sobre las muestras reales
793
  df_real_only = embeddings["real"].copy()
794
- pca_real = PCA(n_components=N_COMPONENTS)
795
- reduced_real = pca_real.fit_transform(df_real_only[embedding_cols].values)
 
 
 
 
796
 
797
- scaler_real = MinMaxScaler(feature_range=(-1, 1))
798
- reduced_real = scaler_real.fit_transform(reduced_real)
799
 
800
  # Agregar columnas PC1, PC2, … a df_real_only
801
  for i in range(reduced_real.shape[1]):
802
  df_real_only[f'PC{i+1}'] = reduced_real[:, i]
803
 
804
- explained_variance_real = pca_real.explained_variance_ratio_
805
  unique_labels_real = sorted(df_real_only['label'].unique().tolist())
806
 
807
  # Mapeo de colores para las muestras reales usando la paleta Reds9
@@ -824,7 +828,7 @@ def run_model(model_name):
824
  # Mostrar los plots de loadings para cada componente
825
  st.subheader("PCA - Real: Component Loadings")
826
  st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
827
- for i, comp in enumerate(pca_real.components_):
828
  source = ColumnDataSource(data=dict(
829
  dimensions=embedding_cols,
830
  weight=comp
@@ -855,7 +859,7 @@ def run_model(model_name):
855
  df_all = {}
856
  # Real
857
  df_real_proj = embeddings["real"].copy()
858
- proj_real = pca_real.transform(df_real_proj[embedding_cols].values)
859
  for i in range(proj_real.shape[1]):
860
  df_real_proj[f'PC{i+1}'] = proj_real[:, i]
861
  df_all["real"] = df_real_proj
@@ -863,7 +867,7 @@ def run_model(model_name):
863
  # Synthetic
864
  if "synthetic" in embeddings:
865
  df_synth_proj = embeddings["synthetic"].copy()
866
- proj_synth = pca_real.transform(df_synth_proj[embedding_cols].values)
867
  for i in range(proj_synth.shape[1]):
868
  df_synth_proj[f'PC{i+1}'] = proj_synth[:, i]
869
  df_all["synthetic"] = df_synth_proj
@@ -871,7 +875,7 @@ def run_model(model_name):
871
  # Pretrained
872
  if "pretrained" in embeddings:
873
  df_pretr_proj = embeddings["pretrained"].copy()
874
- proj_pretr = pca_real.transform(df_pretr_proj[embedding_cols].values)
875
  for i in range(proj_pretr.shape[1]):
876
  df_pretr_proj[f'PC{i+1}'] = proj_pretr[:, i]
877
  df_all["pretrained"] = df_pretr_proj
 
9
  from sklearn.manifold import TSNE, trustworthiness
10
  from sklearn.metrics import pairwise_distances
11
  from sklearn.preprocessing import MinMaxScaler
12
+ from sklearn.pipeline import Pipeline
13
  import io
14
  import ot
15
  from sklearn.linear_model import LinearRegression
 
451
 
452
  def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
453
  if reduction_method == "PCA":
454
+ reducer = Pipeline([
455
+ ("pca", PCA(n_components=N_COMPONENTS)),
456
+ ("scaler", MinMaxScaler(feature_range=(-1, 1)))
457
+ ])
458
  else:
459
  reducer = TSNE(n_components=2, random_state=42,
460
  perplexity=tsne_params["perplexity"],
461
  learning_rate=tsne_params["learning_rate"])
462
 
463
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
 
 
464
  # Guardamos el embedding completo (por ejemplo, 4 dimensiones en PCA)
465
  df_combined['embedding'] = list(reduced)
466
  # Si el embedding es 2D, asignamos x e y para visualización
 
470
 
471
  explained_variance = None
472
  if reduction_method == "PCA":
473
+ explained_variance = reducer.named_steps["pca"].explained_variance_ratio_
474
 
475
  trust = None
476
  cont = None
 
793
  # -------------------------------------------------------------------------
794
  # 1. PCA sobre las muestras reales
795
  df_real_only = embeddings["real"].copy()
796
+ reducer_real = Pipeline([
797
+ ("pca", PCA(n_components=N_COMPONENTS)),
798
+ ("scaler", MinMaxScaler(feature_range=(-1, 1)))
799
+ ])
800
+
801
+ reduced_real = reducer_real.fit_transform(df_real_only[embedding_cols].values)
802
 
 
 
803
 
804
  # Agregar columnas PC1, PC2, … a df_real_only
805
  for i in range(reduced_real.shape[1]):
806
  df_real_only[f'PC{i+1}'] = reduced_real[:, i]
807
 
808
+ explained_variance_real = reducer_real.named_steps["pca"].explained_variance_ratio_
809
  unique_labels_real = sorted(df_real_only['label'].unique().tolist())
810
 
811
  # Mapeo de colores para las muestras reales usando la paleta Reds9
 
828
  # Mostrar los plots de loadings para cada componente
829
  st.subheader("PCA - Real: Component Loadings")
830
  st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
831
+ for i, comp in enumerate(reducer_real.named_steps["pca"].components_):
832
  source = ColumnDataSource(data=dict(
833
  dimensions=embedding_cols,
834
  weight=comp
 
859
  df_all = {}
860
  # Real
861
  df_real_proj = embeddings["real"].copy()
862
+ proj_real = reducer_real.named_steps["pca"].transform(df_real_proj[embedding_cols].values)
863
  for i in range(proj_real.shape[1]):
864
  df_real_proj[f'PC{i+1}'] = proj_real[:, i]
865
  df_all["real"] = df_real_proj
 
867
  # Synthetic
868
  if "synthetic" in embeddings:
869
  df_synth_proj = embeddings["synthetic"].copy()
870
+ proj_synth = reducer_real.named_steps["pca"].transform(df_synth_proj[embedding_cols].values)
871
  for i in range(proj_synth.shape[1]):
872
  df_synth_proj[f'PC{i+1}'] = proj_synth[:, i]
873
  df_all["synthetic"] = df_synth_proj
 
875
  # Pretrained
876
  if "pretrained" in embeddings:
877
  df_pretr_proj = embeddings["pretrained"].copy()
878
+ proj_pretr = reducer_real.named_steps["pca"].transform(df_pretr_proj[embedding_cols].values)
879
  for i in range(proj_pretr.shape[1]):
880
  df_pretr_proj[f'PC{i+1}'] = proj_pretr[:, i]
881
  df_all["pretrained"] = df_pretr_proj