"""Clustering experiment endpoints: run an algorithm on a stored dataset and read back saved results."""

import json
import time
from collections import Counter

import pandas as pd
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.orm import Session
from sklearn.cluster import (
    AffinityPropagation,
    AgglomerativeClustering,
    Birch,
    BisectingKMeans,
    DBSCAN,
    KMeans,
    MeanShift,
    MiniBatchKMeans,
    OPTICS,
    SpectralClustering,
)
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

from backend.app.db import get_db
from backend.app.repositories.dataset_repo import get_dataset
from backend.app.repositories.experiment_repo import create_experiment, get_experiment
from backend.app.utils.ids import make_experiment_id

try:
    import hdbscan
except ImportError:
    # Optional dependency: build_model() answers with a 400 when the
    # "hdbscan" algorithm is requested but the package is missing.
    hdbscan = None

router = APIRouter(tags=["experiments"])


class RunRequest(BaseModel):
    """Request body for ``POST /experiments/run``."""

    dataset_id: str
    name: str | None = None
    algorithm: str = "kmeans"
    # Default cluster/component count; a matching key in ``algorithm_params``
    # takes precedence inside build_model().
    n_clusters: int = 4
    feature_columns: list[str]
    algorithm_params: dict | None = None


def build_model(algorithm: str, n_clusters: int, params: dict) -> tuple[object, str]:
    """Instantiate the estimator selected by *algorithm*.

    Args:
        algorithm: Algorithm key, e.g. ``"kmeans"`` or ``"dbscan"``.
        n_clusters: Fallback cluster count for algorithms that take one.
        params: Per-algorithm overrides; keys that are absent fall back to
            the defaults written below.

    Returns:
        ``(model, mode)`` where *mode* is ``"fit_predict"`` when labels come
        from ``model.fit_predict(X)`` and ``"predict_after_fit"`` when the
        model has to be fitted first and then asked to ``predict``.

    Raises:
        HTTPException: 400 for an unknown algorithm, or when ``"hdbscan"`` is
            requested and the ``hdbscan`` package is not installed.
    """
    # Most algorithms accept the same "n_clusters" override.
    k = params.get("n_clusters", n_clusters)

    if algorithm == "kmeans":
        return KMeans(n_clusters=k, n_init=10, random_state=42), "fit_predict"

    if algorithm == "mini_batch_kmeans":
        return MiniBatchKMeans(
            n_clusters=k,
            random_state=42,
            batch_size=params.get("batch_size", 1024),
        ), "fit_predict"

    if algorithm == "agglomerative":
        return AgglomerativeClustering(
            n_clusters=k,
            linkage=params.get("linkage", "ward"),
        ), "fit_predict"

    if algorithm == "birch":
        return Birch(
            n_clusters=k,
            threshold=params.get("threshold", 0.5),
        ), "fit_predict"

    if algorithm == "dbscan":
        return DBSCAN(
            eps=params.get("eps", 0.5),
            min_samples=params.get("min_samples", 5),
        ), "fit_predict"

    if algorithm == "optics":
        return OPTICS(min_samples=params.get("min_samples", 5)), "fit_predict"

    if algorithm == "mean_shift":
        return MeanShift(), "fit_predict"

    if algorithm == "spectral":
        return SpectralClustering(
            n_clusters=k,
            random_state=42,
            assign_labels="kmeans",
            affinity=params.get("affinity", "nearest_neighbors"),
        ), "fit_predict"

    if algorithm == "gaussian_mixture":
        # The override key here is "n_components", not "n_clusters".
        return GaussianMixture(
            n_components=params.get("n_components", n_clusters),
            random_state=42,
        ), "predict_after_fit"

    if algorithm == "affinity_propagation":
        return AffinityPropagation(random_state=42), "fit_predict"

    if algorithm == "bisecting_kmeans":
        return BisectingKMeans(n_clusters=k, random_state=42), "fit_predict"

    if algorithm == "hdbscan":
        if hdbscan is None:
            raise HTTPException(status_code=400, detail="HDBSCAN is not installed")
        return hdbscan.HDBSCAN(
            min_cluster_size=params.get("min_cluster_size", 10),
            min_samples=params.get("min_samples", 5),
        ), "fit_predict"

    raise HTTPException(status_code=400, detail="Unsupported algorithm")


def _load_dataframe(file_path: str) -> pd.DataFrame:
    """Read the dataset file: CSV by extension (case-insensitive), otherwise Excel."""
    if file_path.lower().endswith(".csv"):
        return pd.read_csv(file_path)
    return pd.read_excel(file_path)


def _project_2d(X: pd.DataFrame) -> list[tuple[float, float]]:
    """Return one ``(x, y)`` PCA coordinate per row of *X* for plotting.

    PCA cannot extract more components than there are columns, so a
    single-column matrix is projected onto one component and ``y`` is set to
    0.0. With fewer than two rows or no columns every point is placed at the
    origin instead of calling PCA.
    """
    n_rows, n_cols = X.shape
    if n_rows < 2 or n_cols < 1:
        return [(0.0, 0.0)] * n_rows

    n_components = min(2, n_cols)
    coords = PCA(n_components=n_components, random_state=42).fit_transform(X)
    return [
        (float(row[0]), float(row[1]) if n_components > 1 else 0.0)
        for row in coords
    ]


@router.post("/experiments/run")
def run_experiment(req: RunRequest, db: Session = Depends(get_db)):
    """Cluster the requested dataset columns, persist the run and return it.

    Raises:
        HTTPException: 404 when the dataset is unknown; 400 when
            ``feature_columns`` is empty or names columns missing from the
            file, when the algorithm is unsupported, or when the estimator
            rejects the data/parameters.
    """
    dataset = get_dataset(db, req.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")
    # Validate the request before paying for file I/O.
    if not req.feature_columns:
        raise HTTPException(status_code=400, detail="feature_columns is required")

    df = _load_dataframe(dataset.file_path)
    missing = [c for c in req.feature_columns if c not in df.columns]
    if missing:
        raise HTTPException(status_code=400, detail=f"Missing columns: {', '.join(missing)}")

    # One-hot encode non-numeric columns, then replace remaining NaNs with 0.
    X = pd.get_dummies(df[req.feature_columns]).fillna(0)

    params = req.algorithm_params or {}
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model, mode = build_model(req.algorithm, req.n_clusters, params)
    if mode not in ("fit_predict", "predict_after_fit"):
        raise HTTPException(status_code=400, detail="Invalid model execution mode")
    try:
        if mode == "fit_predict":
            labels = model.fit_predict(X)
        else:
            model.fit(X)
            labels = model.predict(X)
    except (ValueError, TypeError) as exc:
        # Estimators raise these for invalid user-supplied parameters or
        # unusable data (e.g. more clusters than rows); that is a client
        # error, not a server fault.
        raise HTTPException(status_code=400, detail=f"Clustering failed: {exc}") from exc
    runtime_ms = int((time.perf_counter() - start) * 1000)

    label_list = labels.tolist()
    counts = Counter(label_list)
    unique_labels = sorted(counts)
    # -1 is treated as the noise label and is not counted as a cluster.
    valid_labels = [label for label in unique_labels if label != -1]

    score = None
    if 1 < len(valid_labels) < len(X):
        # NOTE: noise points (-1) are passed through to silhouette_score
        # unfiltered, so they are scored as if they formed a cluster.
        try:
            score = float(silhouette_score(X, labels))
        except Exception:
            # Best effort: the score is optional and must not fail the run.
            score = None

    points = [
        {
            "row_index": row_index,
            "cluster_label": int(label),
            "x": x,
            "y": y,
        }
        for row_index, (label, (x, y)) in enumerate(zip(label_list, _project_2d(X)))
    ]

    cluster_sizes = {str(label): counts[label] for label in unique_labels}
    experiment_id = make_experiment_id()

    metrics = {
        "silhouette_score": score,
        "cluster_count": len(valid_labels) if valid_labels else len(unique_labels),
        "row_count": int(len(X)),
        "runtime_ms": runtime_ms,
        "noise_count": counts.get(-1, 0),
    }
    summary = {
        "feature_columns": req.feature_columns,
        "cluster_sizes": cluster_sizes,
        "points": points,
    }

    create_experiment(
        db=db,
        id=experiment_id,
        dataset_id=req.dataset_id,
        algorithm=req.algorithm,
        status="completed",
        config_json=json.dumps(req.model_dump()),
        metrics_json=json.dumps(metrics),
        summary_json=json.dumps(summary),
        runtime_ms=runtime_ms,
        error_message=None,
    )

    return {
        "experiment_id": experiment_id,
        "status": "completed",
        "silhouette_score": score,
        "cluster_count": metrics["cluster_count"],
        "cluster_sizes": cluster_sizes,
        "runtime_ms": runtime_ms,
        "noise_count": metrics["noise_count"],
        "points": points,
    }


def _loads_or_empty(raw: str | None) -> dict:
    """Decode a stored JSON column, treating an empty/NULL value as ``{}``."""
    return json.loads(raw) if raw else {}


@router.get("/experiments/{experiment_id}/results")
def experiment_results(experiment_id: str, db: Session = Depends(get_db)):
    """Return the stored configuration, metrics and summary of an experiment.

    Raises:
        HTTPException: 404 when no experiment has the given id.
    """
    exp = get_experiment(db, experiment_id)
    if not exp:
        raise HTTPException(status_code=404, detail="Experiment not found")

    return {
        "experiment_id": exp.id,
        "dataset_id": exp.dataset_id,
        "algorithm": exp.algorithm,
        "status": exp.status,
        "config": _loads_or_empty(exp.config_json),
        "metrics": _loads_or_empty(exp.metrics_json),
        "summary": _loads_or_empty(exp.summary_json),
        "runtime_ms": exp.runtime_ms,
        "error_message": exp.error_message,
    }