Spaces:
Running
Running
Commit
·
4279043
1
Parent(s):
ce05869
TSNE Parameters Optimization
Browse files
app.py
CHANGED
|
@@ -312,19 +312,105 @@ def create_table(df_distances):
|
|
| 312 |
data_table = DataTable(source=source_table, columns=columns, sizing_mode='stretch_width', height=total_height)
|
| 313 |
return data_table, df_table, source_table
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
def run_model(model_name):
|
| 316 |
embeddings = load_embeddings(model_name)
|
| 317 |
if embeddings is None:
|
| 318 |
return
|
| 319 |
-
|
| 320 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
| 321 |
df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
|
| 323 |
reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
if reduction_method == "PCA":
|
| 325 |
reducer = PCA(n_components=2)
|
| 326 |
else:
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
| 328 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
| 329 |
dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
|
| 330 |
|
|
@@ -389,7 +475,7 @@ def run_model(model_name):
|
|
| 389 |
scatter_fig.legend.location = "top_right"
|
| 390 |
|
| 391 |
# Agregar HoverTool para mostrar x, y y la fuente al hacer hover
|
| 392 |
-
hover_tool = HoverTool(tooltips=[("
|
| 393 |
scatter_fig.add_tools(hover_tool)
|
| 394 |
# --- Fin scatter plot ---
|
| 395 |
|
|
|
|
| 312 |
data_table = DataTable(source=source_table, columns=columns, sizing_mode='stretch_width', height=total_height)
|
| 313 |
return data_table, df_table, source_table
|
| 314 |
|
| 315 |
+
def _global_regression_r2(df_distances, df_f1):
    """Return the R² of a linear fit of F1 scores on global distances.

    Rows of ``df_distances`` whose index label starts with ``"Global"`` and
    carries a name in parentheses are collected per source. For every column
    of ``df_f1`` that matches such a source, the distance values (x) and the
    F1 values (y) are pooled and a single ``LinearRegression`` is fitted.

    Returns ``None`` when no usable (x, y) pairs were found.
    """
    global_distances = {}
    for idx in df_distances.index:
        label = str(idx)
        # A "Global" label without parentheses has no source name to extract;
        # skip it instead of failing on split("(")[1].
        if not label.startswith("Global") or "(" not in label:
            continue
        source = label.split("(")[1].rstrip(")")
        global_distances[source] = df_distances.loc[idx].values

    all_x = []
    all_y = []
    for source in df_f1.columns:
        if source not in global_distances:
            continue
        x_vals = global_distances[source]
        y_vals = df_f1[source].values
        # The regression pairs x and y positionally; a source with a
        # different number of distances and F1 scores cannot be paired.
        if len(x_vals) != len(y_vals):
            continue
        all_x.extend(x_vals)
        all_y.extend(y_vals)

    if not all_x:
        return None

    x_arr = np.array(all_x).reshape(-1, 1)
    y_arr = np.array(all_y)
    model = LinearRegression().fit(x_arr, y_arr)
    return model.score(x_arr, y_arr)


def optimize_tsne_params(df_combined, embedding_cols, df_f1):
    """Grid-search t-SNE perplexity and learning rate for the best R².

    For each (perplexity, learning rate) pair the embeddings are reduced to
    2-D with t-SNE, Wasserstein distances between the synthetic and real
    subsets are computed, and a linear regression of the F1 scores in
    ``df_f1`` on the global distances is scored. Progress and every evaluated
    pair are written to the Streamlit page.

    Args:
        df_combined: DataFrame holding the embedding columns; also passed to
            ``split_versions`` together with the reduced coordinates.
        embedding_cols: Names of the columns of ``df_combined`` fed to t-SNE.
        df_f1: DataFrame of F1 scores with one column per source.

    Returns:
        A tuple ``(best_params, best_R2)`` where ``best_params`` is
        ``(perplexity, learning_rate)``. If no combination produced usable
        regression data, ``best_params`` is ``None`` and ``best_R2`` is
        ``-np.inf``.
    """
    # Search ranges (adjust the limits and number of steps as needed).
    perplexity_range = np.linspace(30, 50, 10)
    learning_rate_range = np.linspace(200, 1000, 20)

    best_R2 = -np.inf
    best_params = None
    total_steps = len(perplexity_range) * len(learning_rate_range)
    step = 0

    # Streamlit placeholder that is overwritten with the current progress.
    progress_text = st.empty()

    # The input matrix does not depend on the hyper-parameters: build it once
    # instead of on every grid iteration.
    embedding_matrix = df_combined[embedding_cols].values

    for p in perplexity_range:
        for lr in learning_rate_range:
            step += 1
            progress_text.text(f"Evaluating: Perplexity={p:.2f}, Learning Rate={lr:.2f} (Step: {step}/{total_steps})")

            # Reduce the embeddings with the candidate t-SNE parameters.
            reducer_temp = TSNE(n_components=2, random_state=42, perplexity=p, learning_rate=lr)
            reduced_temp = reducer_temp.fit_transform(embedding_matrix)
            dfs_reduced_temp, unique_subsets_temp = split_versions(df_combined, reduced_temp)

            # Wasserstein distances between synthetic and real subsets.
            df_distances_temp = compute_wasserstein_distances_synthetic_individual(
                dfs_reduced_temp["synthetic"],
                dfs_reduced_temp["real"],
                unique_subsets_temp["real"],
            )

            r2_temp = _global_regression_r2(df_distances_temp, df_f1)
            if r2_temp is None:
                # Nothing to regress on for this combination.
                continue

            # Show the evaluated pair and the R² it obtained.
            st.write(f"Parameters: Perplexity={p:.2f}, Learning Rate={lr:.2f} -> R²={r2_temp:.4f}")

            if r2_temp > best_R2:
                best_R2 = r2_temp
                best_params = (p, lr)

    progress_text.text("Optimization completed!")
    return best_params, best_R2
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
|
| 381 |
def run_model(model_name):
|
| 382 |
embeddings = load_embeddings(model_name)
|
| 383 |
if embeddings is None:
|
| 384 |
return
|
| 385 |
+
|
| 386 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
| 387 |
df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
|
| 388 |
+
|
| 389 |
+
# Leer el CSV de f1-donut (usado para evaluar la regresión)
|
| 390 |
+
try:
|
| 391 |
+
df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
|
| 392 |
+
except Exception as e:
|
| 393 |
+
st.error(f"Error loading f1-donut.csv: {e}")
|
| 394 |
+
return
|
| 395 |
+
|
| 396 |
st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
|
| 397 |
reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
|
| 398 |
+
|
| 399 |
+
# Opción para optimizar los parámetros TSNE
|
| 400 |
+
if reduction_method == "t-SNE":
|
| 401 |
+
if st.button("Optimize TSNE parameters", key=f"optimize_tnse_{model_name}"):
|
| 402 |
+
st.info("Running optimization, this can take a while...")
|
| 403 |
+
best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
|
| 404 |
+
st.success(f"Mejores parámetros: Perplexity = {best_params[0]:.2f}, Learning Rate = {best_params[1]:.2f} con R² = {best_R2:.4f}")
|
| 405 |
+
|
| 406 |
+
# Permitir al usuario ingresar manualmente los valores (o podrías reemplazar estos por los optimizados)
|
| 407 |
if reduction_method == "PCA":
|
| 408 |
reducer = PCA(n_components=2)
|
| 409 |
else:
|
| 410 |
+
perplexity_val = st.number_input("Perplexity", min_value=5, max_value=50, value=30, step=1, key=f"perplexity_{model_name}")
|
| 411 |
+
learning_rate_val = st.number_input("Learning Rate", min_value=10, max_value=1000, value=200, step=10, key=f"learning_rate_{model_name}")
|
| 412 |
+
reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity_val, learning_rate=learning_rate_val)
|
| 413 |
+
|
| 414 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
| 415 |
dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
|
| 416 |
|
|
|
|
| 475 |
scatter_fig.legend.location = "top_right"
|
| 476 |
|
| 477 |
# Agregar HoverTool para mostrar x, y y la fuente al hacer hover
|
| 478 |
+
hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
|
| 479 |
scatter_fig.add_tools(hover_tool)
|
| 480 |
# --- Fin scatter plot ---
|
| 481 |
|