Spaces:
Sleeping
Sleeping
Commit
·
eff2e30
1
Parent(s):
d5b8742
Replicate PCA Considering Only Real Samples
Browse files
app.py
CHANGED
|
@@ -14,7 +14,7 @@ from sklearn.linear_model import LinearRegression
|
|
| 14 |
|
| 15 |
N_COMPONENTS = 2
|
| 16 |
TSNE_NEIGHBOURS = 150
|
| 17 |
-
WEIGHT_FACTOR = 0.
|
| 18 |
|
| 19 |
TOOLTIPS = """
|
| 20 |
<div>
|
|
@@ -747,62 +747,6 @@ def run_model(model_name):
|
|
| 747 |
"Explained Variance": explained_variance_real
|
| 748 |
})
|
| 749 |
st.table(variance_df_real)
|
| 750 |
-
|
| 751 |
-
# Agregar scatter plot para visualizar el PCA real
|
| 752 |
-
st.subheader("PCA - Real: Scatter Plot")
|
| 753 |
-
fig_real = figure(
|
| 754 |
-
title="PCA - Solo Real: Scatter Plot",
|
| 755 |
-
plot_width=600,
|
| 756 |
-
plot_height=600,
|
| 757 |
-
tools="pan,wheel_zoom,reset,save,hover",
|
| 758 |
-
active_scroll="wheel_zoom",
|
| 759 |
-
background_fill_color="white"
|
| 760 |
-
)
|
| 761 |
-
# Mostrar solo grid horizontal
|
| 762 |
-
fig_real.xgrid.grid_line_color = None
|
| 763 |
-
fig_real.ygrid.grid_line_color = "gray"
|
| 764 |
-
|
| 765 |
-
# Dibujar los puntos por cada etiqueta
|
| 766 |
-
for label in unique_labels_real:
|
| 767 |
-
subset = df_real_only[df_real_only['label'] == label]
|
| 768 |
-
source_scatter = ColumnDataSource(data={
|
| 769 |
-
'x': subset['x'],
|
| 770 |
-
'y': subset['y'],
|
| 771 |
-
'label': subset['label']
|
| 772 |
-
})
|
| 773 |
-
fig_real.circle('x', 'y', size=10,
|
| 774 |
-
fill_color=real_color_mapping[label],
|
| 775 |
-
line_color=real_color_mapping[label],
|
| 776 |
-
legend_label=label,
|
| 777 |
-
source=source_scatter)
|
| 778 |
-
|
| 779 |
-
# Calcular el centroide de todos los puntos
|
| 780 |
-
center_x = df_real_only['x'].mean()
|
| 781 |
-
center_y = df_real_only['y'].mean()
|
| 782 |
-
|
| 783 |
-
# Calcular el radio como la máxima distancia desde el centroide
|
| 784 |
-
distances = np.sqrt((df_real_only['x'] - center_x)**2 + (df_real_only['y'] - center_y)**2)
|
| 785 |
-
radius = distances.max()
|
| 786 |
-
|
| 787 |
-
# Dibujar el centroide
|
| 788 |
-
fig_real.circle(x=center_x, y=center_y, size=15,
|
| 789 |
-
fill_color="black", line_color="black", legend_label="Centroide")
|
| 790 |
-
|
| 791 |
-
# Dibujar la circunferencia (con línea discontinua)
|
| 792 |
-
fig_real.circle(x=center_x, y=center_y, radius=radius,
|
| 793 |
-
fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
|
| 794 |
-
|
| 795 |
-
fig_real.xaxis.axis_label = "PC1"
|
| 796 |
-
fig_real.yaxis.axis_label = "PC2"
|
| 797 |
-
|
| 798 |
-
hover_scatter = fig_real.select_one(HoverTool)
|
| 799 |
-
hover_scatter.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
|
| 800 |
-
|
| 801 |
-
fig_real.legend.location = "top_right"
|
| 802 |
-
st.bokeh_chart(fig_real)
|
| 803 |
-
|
| 804 |
-
# Mostrar el valor del radio debajo del gráfico
|
| 805 |
-
st.write(f"El radio de la circunferencia es: {int(radius)}")
|
| 806 |
|
| 807 |
# Mostrar los plots de loadings (Component Loadings)
|
| 808 |
st.subheader("PCA - Real: Component Loadings")
|
|
@@ -834,6 +778,250 @@ def run_model(model_name):
|
|
| 834 |
hover = p.select_one(HoverTool)
|
| 835 |
hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
|
| 836 |
st.bokeh_chart(p)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
|
| 838 |
|
| 839 |
def main():
|
|
|
|
| 14 |
|
| 15 |
N_COMPONENTS = 2
|
| 16 |
TSNE_NEIGHBOURS = 150
|
| 17 |
+
WEIGHT_FACTOR = 0.05
|
| 18 |
|
| 19 |
TOOLTIPS = """
|
| 20 |
<div>
|
|
|
|
| 747 |
"Explained Variance": explained_variance_real
|
| 748 |
})
|
| 749 |
st.table(variance_df_real)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
|
| 751 |
# Mostrar los plots de loadings (Component Loadings)
|
| 752 |
st.subheader("PCA - Real: Component Loadings")
|
|
|
|
| 778 |
hover = p.select_one(HoverTool)
|
| 779 |
hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
|
| 780 |
st.bokeh_chart(p)
|
| 781 |
+
|
| 782 |
+
# Segundo PCA: Proyección de todos los subconjuntos usando los loadings calculados con df_real_only
|
| 783 |
+
st.subheader("PCA - Todos los subconjuntos proyectados (usando loadings de df_real)")
|
| 784 |
+
|
| 785 |
+
# Crear un diccionario para almacenar las proyecciones usando el PCA calculado con las muestras reales (pca_real)
|
| 786 |
+
df_all = {}
|
| 787 |
+
|
| 788 |
+
# Proyectar las muestras reales
|
| 789 |
+
df_real_proj = embeddings["real"].copy()
|
| 790 |
+
proj_real = pca_real.transform(df_real_proj[embedding_cols].values)
|
| 791 |
+
df_real_proj['pc1'] = proj_real[:, 0]
|
| 792 |
+
df_real_proj['pc2'] = proj_real[:, 1]
|
| 793 |
+
df_all["real"] = df_real_proj
|
| 794 |
+
|
| 795 |
+
# Proyectar el subconjunto synthetic, si existe
|
| 796 |
+
if "synthetic" in embeddings:
|
| 797 |
+
df_synth_proj = embeddings["synthetic"].copy()
|
| 798 |
+
proj_synth = pca_real.transform(df_synth_proj[embedding_cols].values)
|
| 799 |
+
df_synth_proj['pc1'] = proj_synth[:, 0]
|
| 800 |
+
df_synth_proj['pc2'] = proj_synth[:, 1]
|
| 801 |
+
df_all["synthetic"] = df_synth_proj
|
| 802 |
+
|
| 803 |
+
# Proyectar el subconjunto pretrained, si existe
|
| 804 |
+
if "pretrained" in embeddings:
|
| 805 |
+
df_pretr_proj = embeddings["pretrained"].copy()
|
| 806 |
+
proj_pretr = pca_real.transform(df_pretr_proj[embedding_cols].values)
|
| 807 |
+
df_pretr_proj['pc1'] = proj_pretr[:, 0]
|
| 808 |
+
df_pretr_proj['pc2'] = proj_pretr[:, 1]
|
| 809 |
+
df_all["pretrained"] = df_pretr_proj
|
| 810 |
+
|
| 811 |
+
# Para utilizar las mismas funciones de plot (create_figure, add_dataset_to_fig, add_synthetic_dataset_to_fig),
|
| 812 |
+
# renombramos las columnas 'pc1' y 'pc2' a 'x' y 'y' en cada dataframe
|
| 813 |
+
for key in df_all:
|
| 814 |
+
df_all[key]["x"] = df_all[key]["pc1"]
|
| 815 |
+
df_all[key]["y"] = df_all[key]["pc2"]
|
| 816 |
+
|
| 817 |
+
# Construir los subconjuntos únicos con la granularidad deseada:
|
| 818 |
+
# - Para "real" y "pretrained": agrupamos por label.
|
| 819 |
+
# - Para "synthetic": agrupamos por la columna "source" (cada source tendrá sus labels).
|
| 820 |
+
unique_subsets = {}
|
| 821 |
+
# Real:
|
| 822 |
+
unique_subsets["real"] = sorted(df_all["real"]['label'].unique().tolist())
|
| 823 |
+
# Synthetic:
|
| 824 |
+
if "synthetic" in df_all:
|
| 825 |
+
unique_synth = {}
|
| 826 |
+
for source in df_all["synthetic"]["source"].unique():
|
| 827 |
+
unique_synth[source] = sorted(df_all["synthetic"][df_all["synthetic"]["source"] == source]['label'].unique().tolist())
|
| 828 |
+
unique_subsets["synthetic"] = unique_synth
|
| 829 |
+
else:
|
| 830 |
+
unique_subsets["synthetic"] = {}
|
| 831 |
+
# Pretrained:
|
| 832 |
+
if "pretrained" in df_all:
|
| 833 |
+
unique_subsets["pretrained"] = sorted(df_all["pretrained"]['label'].unique().tolist())
|
| 834 |
+
else:
|
| 835 |
+
unique_subsets["pretrained"] = []
|
| 836 |
+
|
| 837 |
+
# Obtener los mapeos de colores utilizando la función ya definida
|
| 838 |
+
color_maps = get_color_maps(unique_subsets)
|
| 839 |
+
|
| 840 |
+
# Definir un mapeo de marcadores para los subconjuntos synthetic (granularidad por source)
|
| 841 |
+
marker_mapping = {
|
| 842 |
+
"es-digital-paragraph-degradation-seq": "x",
|
| 843 |
+
"es-digital-line-degradation-seq": "cross",
|
| 844 |
+
"es-digital-seq": "triangle",
|
| 845 |
+
"es-digital-rotation-degradation-seq": "diamond",
|
| 846 |
+
"es-digital-zoom-degradation-seq": "asterisk",
|
| 847 |
+
"es-render-seq": "inverted_triangle"
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
# Ahora, crear la figura utilizando las funciones existentes para mantener la granularidad:
|
| 851 |
+
# Se plotean las muestras reales, synthetic (por source) y pretrained con sus respectivos marcadores y colores.
|
| 852 |
+
fig_all = figure(
|
| 853 |
+
title="PCA - Todos los subconjuntos proyectados",
|
| 854 |
+
plot_width=600,
|
| 855 |
+
plot_height=600,
|
| 856 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
| 857 |
+
active_scroll="wheel_zoom",
|
| 858 |
+
background_fill_color="white"
|
| 859 |
+
)
|
| 860 |
+
# Solo grid horizontal
|
| 861 |
+
fig_all.xgrid.grid_line_color = None
|
| 862 |
+
fig_all.ygrid.grid_line_color = "gray"
|
| 863 |
+
|
| 864 |
+
# Ploteamos los puntos de las muestras reales (agrupados por label)
|
| 865 |
+
for label in unique_subsets["real"]:
|
| 866 |
+
subset = df_all["real"][df_all["real"]['label'] == label]
|
| 867 |
+
source = ColumnDataSource(data={
|
| 868 |
+
'x': subset['x'],
|
| 869 |
+
'y': subset['y'],
|
| 870 |
+
'label': subset['label']
|
| 871 |
+
})
|
| 872 |
+
# Usamos 'circle' para las reales
|
| 873 |
+
fig_all.circle('x', 'y', size=10,
|
| 874 |
+
fill_color=color_maps["real"][label],
|
| 875 |
+
line_color=color_maps["real"][label],
|
| 876 |
+
legend_label=f"Real: {label}",
|
| 877 |
+
source=source)
|
| 878 |
+
|
| 879 |
+
# Ploteamos los puntos de synthetic, diferenciando cada source con su marcador
|
| 880 |
+
if unique_subsets["synthetic"]:
|
| 881 |
+
for source_name, labels in unique_subsets["synthetic"].items():
|
| 882 |
+
df_source = df_all["synthetic"][df_all["synthetic"]["source"] == source_name]
|
| 883 |
+
marker = marker_mapping.get(source_name, "square")
|
| 884 |
+
# Para cada label en ese source, usamos la función auxiliar
|
| 885 |
+
renderers = add_synthetic_dataset_to_fig(fig_all, df_source, labels,
|
| 886 |
+
marker=marker,
|
| 887 |
+
color_mapping=color_maps["synthetic"][source_name],
|
| 888 |
+
group_label=source_name)
|
| 889 |
+
# Ploteamos los puntos de pretrained (agrupados por label)
|
| 890 |
+
if unique_subsets["pretrained"]:
|
| 891 |
+
for label in unique_subsets["pretrained"]:
|
| 892 |
+
subset = df_all["pretrained"][df_all["pretrained"]['label'] == label]
|
| 893 |
+
source = ColumnDataSource(data={
|
| 894 |
+
'x': subset['x'],
|
| 895 |
+
'y': subset['y'],
|
| 896 |
+
'label': subset['label']
|
| 897 |
+
})
|
| 898 |
+
# Usamos 'triangle' para pretrained (por ejemplo)
|
| 899 |
+
fig_all.triangle('x', 'y', size=10,
|
| 900 |
+
fill_color=color_maps["pretrained"][label],
|
| 901 |
+
line_color=color_maps["pretrained"][label],
|
| 902 |
+
legend_label=f"Pretrained: {label}",
|
| 903 |
+
source=source)
|
| 904 |
+
|
| 905 |
+
# Calcular el centroide y el radio (usando solo las muestras reales)
|
| 906 |
+
center_x = df_all["real"]['x'].mean()
|
| 907 |
+
center_y = df_all["real"]['y'].mean()
|
| 908 |
+
distances = np.sqrt((df_all["real"]['x'] - center_x)**2 + (df_all["real"]['y'] - center_y)**2)
|
| 909 |
+
radius = distances.max()
|
| 910 |
+
|
| 911 |
+
# Dibujar el centroide y la circunferencia en el plot
|
| 912 |
+
fig_all.circle(x=center_x, y=center_y, size=15,
|
| 913 |
+
fill_color="black", line_color="black", legend_label="Centroide")
|
| 914 |
+
fig_all.circle(x=center_x, y=center_y, radius=radius,
|
| 915 |
+
fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
|
| 916 |
+
|
| 917 |
+
fig_all.xaxis.axis_label = "PC1"
|
| 918 |
+
fig_all.yaxis.axis_label = "PC2"
|
| 919 |
+
hover_all = fig_all.select_one(HoverTool)
|
| 920 |
+
hover_all.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
|
| 921 |
+
|
| 922 |
+
# Agregar checkbox para mostrar u ocultar la leyenda, igual que en el primer PCA
|
| 923 |
+
show_legend_second = st.checkbox("Show Legend", value=False, key=f"legend_second_{model_name}")
|
| 924 |
+
fig_all.legend.visible = show_legend_second
|
| 925 |
+
fig_all.legend.location = "top_right"
|
| 926 |
+
|
| 927 |
+
st.bokeh_chart(fig_all)
|
| 928 |
+
|
| 929 |
+
# Mostrar el valor del radio debajo del gráfico
|
| 930 |
+
st.write(f"El radio de la circunferencia (calculado a partir de las muestras reales) es: {radius:.4f}")
|
| 931 |
+
|
| 932 |
+
|
| 933 |
+
# --- Cálculo de distancias y scatter plot de Distance vs F1 para el nuevo PCA ---
|
| 934 |
+
|
| 935 |
+
# Se calcula la distancia de cada subset synthetic a cada subset real usando los datos proyectados (df_all)
|
| 936 |
+
# Se utiliza la función compute_cluster_distances_synthetic_individual ya definida
|
| 937 |
+
real_labels_new = sorted(df_all["real"]['label'].unique().tolist())
|
| 938 |
+
df_distances_new = compute_cluster_distances_synthetic_individual(
|
| 939 |
+
df_all["synthetic"],
|
| 940 |
+
df_all["real"],
|
| 941 |
+
real_labels_new,
|
| 942 |
+
metric="wasserstein", # Puedes cambiar la métrica según lo requieras
|
| 943 |
+
bins=20
|
| 944 |
+
)
|
| 945 |
+
|
| 946 |
+
# Extraer las distancias globales (por cada source) del dataframe obtenido,
|
| 947 |
+
# buscando filas cuyo índice comience con "Global" (formato "Global (source)")
|
| 948 |
+
global_distances_new = {}
|
| 949 |
+
for idx in df_distances_new.index:
|
| 950 |
+
if idx.startswith("Global"):
|
| 951 |
+
source_name = idx.split("(")[1].rstrip(")")
|
| 952 |
+
global_distances_new[source_name] = df_distances_new.loc[idx].values
|
| 953 |
+
|
| 954 |
+
# Ahora, relacionar estas distancias con los valores de F1 (ya cargados en df_f1)
|
| 955 |
+
all_x_new = []
|
| 956 |
+
all_y_new = []
|
| 957 |
+
for source in df_f1.columns:
|
| 958 |
+
if source in global_distances_new:
|
| 959 |
+
x_vals = global_distances_new[source]
|
| 960 |
+
y_vals = df_f1[source].values
|
| 961 |
+
all_x_new.extend(x_vals)
|
| 962 |
+
all_y_new.extend(y_vals)
|
| 963 |
+
all_x_arr_new = np.array(all_x_new).reshape(-1, 1)
|
| 964 |
+
all_y_arr_new = np.array(all_y_new)
|
| 965 |
+
|
| 966 |
+
# Realizar la regresión lineal global sobre estos datos
|
| 967 |
+
model_global_new = LinearRegression().fit(all_x_arr_new, all_y_arr_new)
|
| 968 |
+
r2_new = model_global_new.score(all_x_arr_new, all_y_arr_new)
|
| 969 |
+
slope_new = model_global_new.coef_[0]
|
| 970 |
+
intercept_new = model_global_new.intercept_
|
| 971 |
+
|
| 972 |
+
# Crear el scatter plot
|
| 973 |
+
scatter_fig_new = figure(
|
| 974 |
+
width=600,
|
| 975 |
+
height=600,
|
| 976 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
| 977 |
+
active_scroll="wheel_zoom",
|
| 978 |
+
title="Scatter Plot: Distance vs F1 (Nueva PCA)",
|
| 979 |
+
background_fill_color="white"
|
| 980 |
+
)
|
| 981 |
+
# Configurar únicamente grid horizontal
|
| 982 |
+
scatter_fig_new.xgrid.grid_line_color = None
|
| 983 |
+
scatter_fig_new.ygrid.grid_line_color = "gray"
|
| 984 |
+
|
| 985 |
+
# Mantenemos el mismo código de colores que en el otro scatter plot
|
| 986 |
+
source_colors = {
|
| 987 |
+
"es-digital-paragraph-degradation-seq": "blue",
|
| 988 |
+
"es-digital-line-degradation-seq": "green",
|
| 989 |
+
"es-digital-seq": "red",
|
| 990 |
+
"es-digital-zoom-degradation-seq": "orange",
|
| 991 |
+
"es-digital-rotation-degradation-seq": "purple",
|
| 992 |
+
"es-digital-rotation-zoom-degradation-seq": "brown",
|
| 993 |
+
"es-render-seq": "cyan"
|
| 994 |
+
}
|
| 995 |
+
|
| 996 |
+
# Dibujar cada conjunto: para cada source (por ejemplo, es-render-seq, etc.)
|
| 997 |
+
for source in df_f1.columns:
|
| 998 |
+
if source in global_distances_new:
|
| 999 |
+
x_vals = global_distances_new[source]
|
| 1000 |
+
y_vals = df_f1[source].values
|
| 1001 |
+
data = {"x": x_vals, "y": y_vals, "Fuente": [source]*len(x_vals)}
|
| 1002 |
+
cds = ColumnDataSource(data=data)
|
| 1003 |
+
scatter_fig_new.circle(
|
| 1004 |
+
'x', 'y', size=8, alpha=0.7, source=cds,
|
| 1005 |
+
fill_color=source_colors.get(source, "gray"),
|
| 1006 |
+
line_color=source_colors.get(source, "gray"),
|
| 1007 |
+
legend_label=source
|
| 1008 |
+
)
|
| 1009 |
+
|
| 1010 |
+
scatter_fig_new.xaxis.axis_label = "Distance (Global, por Colegio) - Nueva PCA"
|
| 1011 |
+
scatter_fig_new.yaxis.axis_label = "F1 Score"
|
| 1012 |
+
scatter_fig_new.legend.location = "top_right"
|
| 1013 |
+
|
| 1014 |
+
hover_tool_new = scatter_fig_new.select_one(HoverTool)
|
| 1015 |
+
hover_tool_new.tooltips = [("Distance", "@x"), ("F1", "@y"), ("Subset", "@Fuente")]
|
| 1016 |
+
|
| 1017 |
+
# Dibujar la línea de regresión global
|
| 1018 |
+
x_line_new = np.linspace(all_x_arr_new.min(), all_x_arr_new.max(), 100)
|
| 1019 |
+
y_line_new = model_global_new.predict(x_line_new.reshape(-1,1))
|
| 1020 |
+
scatter_fig_new.line(x_line_new, y_line_new, line_width=2, line_color="black", legend_label="Global Regression")
|
| 1021 |
+
|
| 1022 |
+
st.bokeh_chart(scatter_fig_new)
|
| 1023 |
+
|
| 1024 |
+
st.write(f"Regresi贸n global (Nueva PCA): R虏 = {r2_new:.4f}, Slope = {slope_new:.4f}, Intercept = {intercept_new:.4f}")
|
| 1025 |
|
| 1026 |
|
| 1027 |
def main():
|