Spaces:
Sleeping
Sleeping
| import json | |
| import time | |
| import pandas as pd | |
| from fastapi import APIRouter, Depends, HTTPException | |
| from pydantic import BaseModel | |
| from sqlalchemy.orm import Session | |
| from sklearn.cluster import ( | |
| AffinityPropagation, | |
| AgglomerativeClustering, | |
| Birch, | |
| BisectingKMeans, | |
| DBSCAN, | |
| KMeans, | |
| MeanShift, | |
| MiniBatchKMeans, | |
| OPTICS, | |
| SpectralClustering, | |
| ) | |
| from sklearn.decomposition import PCA | |
| from sklearn.metrics import silhouette_score | |
| from sklearn.mixture import GaussianMixture | |
| from backend.app.db import get_db | |
| from backend.app.repositories.dataset_repo import get_dataset | |
| from backend.app.repositories.experiment_repo import create_experiment | |
| from backend.app.utils.ids import make_experiment_id | |
| try: | |
| import hdbscan | |
| except ImportError: | |
| hdbscan = None | |
# Module-level router; anything registered on it is tagged "experiments".
router = APIRouter(tags=["experiments"])
class RunRequest(BaseModel):
    """Request payload describing one clustering run.

    ``dataset_id`` and ``feature_columns`` have no default and must be
    supplied; every other field falls back to the default shown below.
    """

    # Identifier handed to ``get_dataset`` to look up the dataset to cluster.
    dataset_id: str
    # Optional label for the run; it is stored as part of the serialized
    # request config.
    name: str | None = None
    # Key that ``build_model`` uses to pick the estimator.
    algorithm: str = "kmeans"
    # Cluster/component count used when ``algorithm_params`` has no override.
    n_clusters: int = 4
    # Dataset columns that make up the feature matrix.
    feature_columns: list[str]
    # Per-algorithm overrides read by ``build_model`` (e.g. ``eps``,
    # ``min_samples``, ``linkage``); ``None`` is treated as "no overrides".
    algorithm_params: dict | None = None
def build_model(algorithm: str, n_clusters: int, params: dict):
    """Instantiate the estimator named by *algorithm*.

    Args:
        algorithm: Estimator key such as ``"kmeans"``, ``"dbscan"`` or
            ``"gaussian_mixture"``.
        n_clusters: Fallback cluster/component count, used when *params*
            carries no explicit override.
        params: Per-algorithm overrides; only the keys relevant to the chosen
            estimator are read.

    Returns:
        A ``(model, mode)`` tuple. ``mode`` is ``"predict_after_fit"`` for the
        Gaussian mixture (fit, then predict) and ``"fit_predict"`` for every
        other estimator.

    Raises:
        HTTPException: 400 when *algorithm* is unknown, or when ``"hdbscan"``
            is requested but the optional package could not be imported.
    """

    def requested_k():
        # An explicit "n_clusters" override wins over the request default.
        return params.get("n_clusters", n_clusters)

    def make_hdbscan():
        # hdbscan is an optional import; the module-level name is None when
        # the package is missing.
        if hdbscan is None:
            raise HTTPException(status_code=400, detail="HDBSCAN is not installed")
        return hdbscan.HDBSCAN(
            min_cluster_size=params.get("min_cluster_size", 10),
            min_samples=params.get("min_samples", 5),
        )

    # Factories are lazy, so only the selected estimator is constructed and
    # only the parameters it needs are looked up.
    factories = {
        "kmeans": lambda: KMeans(
            n_clusters=requested_k(),
            n_init=10,
            random_state=42,
        ),
        "mini_batch_kmeans": lambda: MiniBatchKMeans(
            n_clusters=requested_k(),
            random_state=42,
            batch_size=params.get("batch_size", 1024),
        ),
        "agglomerative": lambda: AgglomerativeClustering(
            n_clusters=requested_k(),
            linkage=params.get("linkage", "ward"),
        ),
        "birch": lambda: Birch(
            n_clusters=requested_k(),
            threshold=params.get("threshold", 0.5),
        ),
        "dbscan": lambda: DBSCAN(
            eps=params.get("eps", 0.5),
            min_samples=params.get("min_samples", 5),
        ),
        "optics": lambda: OPTICS(
            min_samples=params.get("min_samples", 5),
        ),
        "mean_shift": lambda: MeanShift(),
        "spectral": lambda: SpectralClustering(
            n_clusters=requested_k(),
            random_state=42,
            assign_labels="kmeans",
            affinity=params.get("affinity", "nearest_neighbors"),
        ),
        "gaussian_mixture": lambda: GaussianMixture(
            n_components=params.get("n_components", n_clusters),
            random_state=42,
        ),
        "affinity_propagation": lambda: AffinityPropagation(random_state=42),
        "bisecting_kmeans": lambda: BisectingKMeans(
            n_clusters=requested_k(),
            random_state=42,
        ),
        "hdbscan": make_hdbscan,
    }

    factory = factories.get(algorithm)
    if factory is None:
        raise HTTPException(status_code=400, detail="Unsupported algorithm")

    # The Gaussian mixture is the only estimator run as fit() then predict().
    if algorithm == "gaussian_mixture":
        mode = "predict_after_fit"
    else:
        mode = "fit_predict"
    return factory(), mode
def run_experiment(req: RunRequest, db: Session = Depends(get_db)):
    """Cluster a stored dataset, persist the experiment, and return a summary.

    The selected feature columns are one-hot encoded with ``pd.get_dummies``,
    missing values are replaced by 0, the estimator chosen by
    ``req.algorithm`` is fitted, and the rows are projected with PCA so they
    can be plotted in two dimensions.

    Args:
        req: Run configuration (dataset, algorithm, features, overrides).
        db: Database session used to load the dataset and store the result.

    Returns:
        A dict with the new ``experiment_id``, ``status``, the silhouette
        score (``None`` when it cannot be computed), cluster count and sizes,
        runtime in milliseconds, noise-point count, and per-row plot points.

    Raises:
        HTTPException: 404 if the dataset does not exist; 400 if
            ``feature_columns`` is empty or names unknown columns, if the
            selection encodes to an empty matrix, if the algorithm is
            unsupported, or if the estimator rejects the data/parameters.
    """
    dataset = get_dataset(db, req.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Reject an empty feature list before paying for file I/O.
    if not req.feature_columns:
        raise HTTPException(status_code=400, detail="feature_columns is required")

    # Case-insensitive match so "DATA.CSV" is not handed to the Excel reader.
    if dataset.file_path.lower().endswith(".csv"):
        df = pd.read_csv(dataset.file_path)
    else:
        df = pd.read_excel(dataset.file_path)

    missing = [c for c in req.feature_columns if c not in df.columns]
    if missing:
        raise HTTPException(status_code=400, detail=f"Missing columns: {', '.join(missing)}")

    # One-hot encode non-numeric columns, then treat missing values as 0.
    X = pd.get_dummies(df[req.feature_columns].copy()).fillna(0)
    n_rows, n_features = X.shape
    if n_rows == 0 or n_features == 0:
        # E.g. an empty file, or only all-null categorical columns selected.
        raise HTTPException(
            status_code=400,
            detail="Selected columns contain no usable data",
        )

    params = req.algorithm_params or {}

    # perf_counter is monotonic, so the duration is immune to clock changes.
    start = time.perf_counter()
    model, mode = build_model(req.algorithm, req.n_clusters, params)
    try:
        if mode == "fit_predict":
            labels = model.fit_predict(X)
        elif mode == "predict_after_fit":
            model.fit(X)
            labels = model.predict(X)
        else:
            raise HTTPException(status_code=400, detail="Invalid model execution mode")
    except ValueError as exc:
        # Bad hyper-parameters or too few rows for the requested cluster
        # count are request problems, not server faults.
        raise HTTPException(status_code=400, detail=f"Clustering failed: {exc}") from exc
    runtime_ms = int((time.perf_counter() - start) * 1000)

    unique_labels = sorted(set(labels.tolist()))
    # Label -1 marks noise points and is not counted as a real cluster.
    valid_labels = [label for label in unique_labels if label != -1]

    score = None
    if 1 < len(valid_labels) < n_rows:
        try:
            score = float(silhouette_score(X, labels))
        except Exception:
            # Deliberately best-effort: the score is an optional metric and
            # must not fail an otherwise successful run.
            score = None

    # PCA cannot produce more components than min(rows, features); with a
    # single encoded feature (or a single row) fall back to one component
    # and plot the points on the x axis.
    n_components = min(2, n_rows, n_features)
    coords = PCA(n_components=n_components, random_state=42).fit_transform(X)
    points = [
        {
            "row_index": row_index,
            "cluster_label": int(label),
            "x": float(xy[0]),
            "y": float(xy[1]) if n_components > 1 else 0.0,
        }
        for row_index, (label, xy) in enumerate(zip(labels, coords))
    ]

    cluster_sizes = {str(label): int((labels == label).sum()) for label in unique_labels}
    experiment_id = make_experiment_id()
    metrics = {
        "silhouette_score": score,
        # When every point is noise this falls back to the raw label count.
        "cluster_count": len(valid_labels) if valid_labels else len(unique_labels),
        "row_count": int(n_rows),
        "runtime_ms": runtime_ms,
        "noise_count": int((labels == -1).sum()) if -1 in unique_labels else 0,
    }
    summary = {
        "feature_columns": req.feature_columns,
        "cluster_sizes": cluster_sizes,
        "points": points,
    }

    create_experiment(
        db=db,
        id=experiment_id,
        dataset_id=req.dataset_id,
        algorithm=req.algorithm,
        status="completed",
        config_json=json.dumps(req.model_dump()),
        metrics_json=json.dumps(metrics),
        summary_json=json.dumps(summary),
        runtime_ms=runtime_ms,
        error_message=None,
    )

    return {
        "experiment_id": experiment_id,
        "status": "completed",
        "silhouette_score": score,
        "cluster_count": metrics["cluster_count"],
        "cluster_sizes": cluster_sizes,
        "runtime_ms": runtime_ms,
        "noise_count": metrics["noise_count"],
        "points": points,
    }
def experiment_results(experiment_id: str, db: Session = Depends(get_db)):
    """Return a stored experiment with its JSON text columns decoded.

    Args:
        experiment_id: Identifier of the experiment to fetch.
        db: Database session used for the lookup.

    Returns:
        A dict with the experiment's id, dataset id, algorithm, status,
        decoded ``config`` / ``metrics`` / ``summary``, runtime and error
        message.

    Raises:
        HTTPException: 404 when no experiment matches *experiment_id*.
    """
    # Imported at call time rather than at module load.
    from backend.app.repositories.experiment_repo import get_experiment

    def decode(raw):
        # Empty or missing JSON text is reported as an empty object.
        if raw:
            return json.loads(raw)
        return {}

    record = get_experiment(db, experiment_id)
    if not record:
        raise HTTPException(status_code=404, detail="Experiment not found")

    payload = {
        "experiment_id": record.id,
        "dataset_id": record.dataset_id,
        "algorithm": record.algorithm,
        "status": record.status,
        "config": decode(record.config_json),
        "metrics": decode(record.metrics_json),
        "summary": decode(record.summary_json),
        "runtime_ms": record.runtime_ms,
        "error_message": record.error_message,
    }
    return payload