Adisri99 committed on
Commit
cfd01cb
·
verified ·
1 Parent(s): 2be61a8

Upload 9 files

Browse files
backend/app/api/compare.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from fastapi import APIRouter, Depends, Query
4
+ from sqlalchemy.orm import Session
5
+
6
+ from backend.app.db import get_db
7
+ from backend.app.repositories.experiment_repo import (
8
+ get_experiments_by_ids,
9
+ list_experiments,
10
+ )
11
+
12
+ router = APIRouter(tags=["compare"])
13
+
14
+
15
@router.get("/compare")
def compare_experiments(
    experiment_ids: str | None = Query(default=None),
    db: Session = Depends(get_db),
):
    """Return stored experiments in a shape suited to side-by-side comparison.

    ``experiment_ids`` is an optional comma-separated list of experiment ids.
    When it is missing or empty, every experiment is returned; otherwise only
    the listed ids are looked up.  Blank entries (for example from a trailing
    comma) are ignored.  The JSON text columns are decoded before being
    returned, and an empty column is reported as ``{}``.
    """

    def _decode(raw):
        # NULL or empty JSON columns are reported as an empty object.
        return json.loads(raw) if raw else {}

    if not experiment_ids:
        rows = list_experiments(db)
    else:
        wanted = []
        for piece in experiment_ids.split(","):
            piece = piece.strip()
            if piece:
                wanted.append(piece)
        rows = get_experiments_by_ids(db, wanted)

    payload = []
    for row in rows:
        payload.append(
            {
                "experiment_id": row.id,
                "dataset_id": row.dataset_id,
                "algorithm": row.algorithm,
                "status": row.status,
                "config": _decode(row.config_json),
                "metrics": _decode(row.metrics_json),
                "summary": _decode(row.summary_json),
                "runtime_ms": row.runtime_ms,
                "error_message": row.error_message,
            }
        )
    return {"experiments": payload}
backend/app/api/experiments.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import secrets
2
+ import time
3
+
4
+ import pandas as pd
5
+ from fastapi import APIRouter, Depends, HTTPException
6
+ from pydantic import BaseModel
7
+ from sqlalchemy.orm import Session
8
+ from sklearn.cluster import AgglomerativeClustering, Birch, KMeans
9
+ from sklearn.metrics import silhouette_score
10
+
11
+ from backend.app.db import get_db
12
+ from backend.app.repositories.dataset_repo import get_dataset
13
+ from backend.app.repositories.experiment_repo import create_experiment
14
+
15
+ router = APIRouter(tags=["experiments"])
16
+
17
+
18
class RunRequest(BaseModel):
    """Request body for a clustering run.

    The field order is kept as-is because the whole model is serialized with
    ``model_dump_json()`` and stored as the experiment's configuration.
    """

    # Id of the dataset whose file will be clustered.
    dataset_id: str
    # Optional free-form label; only stored as part of the serialized config.
    name: str | None = None
    # One of "kmeans", "agglomerative" or "birch".
    algorithm: str = "kmeans"
    # Number of clusters requested from the algorithm.
    n_clusters: int = 4
    # Columns of the dataset used as clustering features.
    feature_columns: list[str]
24
+
25
+
26
def _load_dataset_frame(file_path):
    """Read a dataset file: CSV when the extension says so, Excel otherwise."""
    # Compare case-insensitively so "DATA.CSV" is not handed to the Excel reader.
    if file_path.lower().endswith(".csv"):
        return pd.read_csv(file_path)
    return pd.read_excel(file_path)


def _build_model(algorithm, n_clusters):
    """Return an unfitted estimator for ``algorithm``, or ``None`` if unsupported."""
    if algorithm == "kmeans":
        return KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    if algorithm == "agglomerative":
        return AgglomerativeClustering(n_clusters=n_clusters)
    if algorithm == "birch":
        return Birch(n_clusters=n_clusters)
    return None


@router.post("/experiments/run")
def run_experiment(req: RunRequest, db: Session = Depends(get_db)):
    """Cluster a stored dataset and persist the run as an experiment.

    The selected feature columns are one-hot encoded with ``pd.get_dummies``
    and missing values are replaced by 0 before fitting.  A silhouette score
    is computed only when it is defined, i.e. when there is more than one
    cluster and fewer clusters than rows.

    Returns:
        A dict with the new experiment id, the per-row cluster labels, the
        silhouette score (or ``None``), the cluster count and sizes, and the
        fit time in milliseconds.

    Raises:
        HTTPException: 404 when the dataset id is unknown; 400 when
            ``feature_columns`` is empty, the algorithm is not supported, a
            requested column is missing from the file, the selected columns
            yield no usable features, or ``n_clusters`` is outside
            ``1..row count``.
    """
    # Function-scope import: this module does not import json at file level.
    import json

    dataset = get_dataset(db, req.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Validate what can be validated from the request alone before reading
    # the dataset file from disk.
    if not req.feature_columns:
        raise HTTPException(status_code=400, detail="feature_columns is required")

    model = _build_model(req.algorithm, req.n_clusters)
    if model is None:
        raise HTTPException(status_code=400, detail="Unsupported algorithm")

    df = _load_dataset_frame(dataset.file_path)

    missing = [c for c in req.feature_columns if c not in df.columns]
    if missing:
        raise HTTPException(status_code=400, detail=f"Missing columns: {', '.join(missing)}")

    X = pd.get_dummies(df[req.feature_columns].copy()).fillna(0)

    # An all-empty categorical column encodes to zero features; reject it
    # here instead of letting the estimator fail with an opaque 500.
    if X.shape[1] == 0:
        raise HTTPException(
            status_code=400,
            detail="Selected columns contain no usable values",
        )

    # The estimators cannot form more clusters than there are rows.
    if req.n_clusters < 1 or req.n_clusters > len(X):
        raise HTTPException(
            status_code=400,
            detail=f"n_clusters must be between 1 and {len(X)}",
        )

    # perf_counter is monotonic, so the duration cannot go negative or jump
    # if the system clock is adjusted while the model is fitting.
    start = time.perf_counter()
    labels = model.fit_predict(X)
    runtime_ms = int((time.perf_counter() - start) * 1000)

    unique_labels = sorted(set(labels.tolist()))
    score = None
    if 1 < len(unique_labels) < len(X):
        score = float(silhouette_score(X, labels))

    cluster_sizes = {str(label): int((labels == label).sum()) for label in unique_labels}
    experiment_id = "exp_" + secrets.token_hex(4)

    metrics = {
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "row_count": int(len(X)),
        "runtime_ms": runtime_ms,
    }
    summary = {
        "feature_columns": req.feature_columns,
        "cluster_sizes": cluster_sizes,
    }

    create_experiment(
        db=db,
        id=experiment_id,
        dataset_id=req.dataset_id,
        algorithm=req.algorithm,
        status="completed",
        config_json=req.model_dump_json(),
        # Both dicts hold only plain Python values, so json.dumps is enough
        # and keeps full float precision.
        metrics_json=json.dumps(metrics),
        summary_json=json.dumps(summary),
        runtime_ms=runtime_ms,
        error_message=None,
    )

    return {
        "experiment_id": experiment_id,
        "status": "completed",
        "clusters": labels.tolist(),
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "cluster_sizes": cluster_sizes,
        "runtime_ms": runtime_ms,
    }
backend/app/api/exports.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+
4
+ import pandas as pd
5
+ from fastapi import APIRouter, Depends, HTTPException
6
+ from fastapi.responses import StreamingResponse
7
+ from sqlalchemy.orm import Session
8
+
9
+ from backend.app.db import get_db
10
+ from backend.app.repositories.dataset_repo import get_dataset
11
+ from backend.app.repositories.experiment_repo import get_experiment
12
+
13
+ router = APIRouter(tags=["exports"])
14
+
15
+
16
@router.get("/exports/{experiment_id}")
def export_experiment(experiment_id: str, db: Session = Depends(get_db)):
    """Download an experiment's dataset as CSV, annotated with run metadata.

    Every row of the dataset file gets the same extra columns: the experiment
    id, algorithm and status, plus one ``metric_<name>`` column per stored
    metric.  Responds with 404 when the experiment or its dataset is unknown.
    """
    experiment = get_experiment(db, experiment_id)
    if not experiment:
        raise HTTPException(status_code=404, detail="Experiment not found")

    dataset = get_dataset(db, experiment.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Anything that is not a .csv file is handed to the Excel reader.
    source_path = dataset.file_path
    reader = pd.read_csv if source_path.endswith(".csv") else pd.read_excel
    export_df = reader(source_path).copy()

    # Scalar assignment broadcasts the value to every row.
    run_columns = (
        ("experiment_id", experiment.id),
        ("algorithm", experiment.algorithm),
        ("experiment_status", experiment.status),
    )
    for column, value in run_columns:
        export_df[column] = value

    stored_metrics = experiment.metrics_json
    metrics = json.loads(stored_metrics) if stored_metrics else {}
    for key, value in metrics.items():
        export_df[f"metric_{key}"] = value

    buffer = io.StringIO()
    export_df.to_csv(buffer, index=False)
    csv_text = buffer.getvalue()

    disposition = f"attachment; filename={experiment_id}_export.csv"
    return StreamingResponse(
        iter([csv_text]),
        media_type="text/csv",
        headers={"Content-Disposition": disposition},
    )
backend/app/api/runs.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from fastapi import APIRouter, Depends
4
+ from sqlalchemy.orm import Session
5
+
6
+ from backend.app.db import get_db
7
+ from backend.app.repositories.experiment_repo import list_experiments
8
+
9
+ router = APIRouter(tags=["runs"])
10
+
11
+
12
@router.get("/runs")
def get_runs(db: Session = Depends(get_db)):
    """List every stored experiment with its decoded metrics and summary."""

    def _decode(raw):
        # NULL or empty JSON columns are reported as an empty object.
        return json.loads(raw) if raw else {}

    runs = []
    for row in list_experiments(db):
        runs.append(
            {
                "experiment_id": row.id,
                "dataset_id": row.dataset_id,
                "algorithm": row.algorithm,
                "status": row.status,
                "metrics": _decode(row.metrics_json),
                "summary": _decode(row.summary_json),
                "runtime_ms": row.runtime_ms,
                "error_message": row.error_message,
            }
        )
    return {"runs": runs}
backend/app/db.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sqlalchemy import create_engine
3
+ from sqlalchemy.orm import declarative_base, sessionmaker
4
+
5
# Location of the SQLite file; override it with the DB_PATH environment variable.
DB_PATH = os.environ.get("DB_PATH", "/data/clusterforge.db")
DATABASE_URL = "sqlite:///" + DB_PATH

# check_same_thread=False is passed through to the SQLite driver.
# NOTE(review): presumably needed because sessions are used from request
# worker threads other than the one that opened the connection - confirm.
engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})

# Factory for the sessions handed out by get_db().
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class that the ORM models inherit from.
Base = declarative_base()
20
+
21
+
22
def init_db() -> None:
    """Import the model modules, then create the tables known to ``Base``."""
    # The imports are deferred to call time because the model modules import
    # ``Base`` from this module; importing them at the top would be circular.
    # They are needed only for their side effect of declaring the models.
    import backend.app.models.dataset  # noqa: F401
    import backend.app.models.experiment  # noqa: F401

    Base.metadata.create_all(bind=engine)
27
+
28
+
29
def get_db():
    """Yield a new database session and close it once the caller is done.

    Written as a generator so it can be used as a FastAPI dependency; the
    ``finally`` clause closes the session even if the request handler raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
backend/app/main.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+
4
+ from backend.app.db import init_db
5
+ from backend.app.api.health import router as health_router
6
+ from backend.app.api.datasets import router as datasets_router
7
+ from backend.app.api.presets import router as presets_router
8
+ from backend.app.api.experiments import router as experiments_router
9
+ from backend.app.api.compare import router as compare_router
10
+ from backend.app.api.exports import router as exports_router
11
+ from backend.app.api.runs import router as runs_router
12
+
13
app = FastAPI(title="ClusterBuster API")


@app.on_event("startup")
def on_startup() -> None:
    """Initialise the database when the application starts."""
    # NOTE(review): recent FastAPI releases deprecate on_event in favour of
    # lifespan handlers - consider migrating.
    init_db()


# Browser origins that are allowed to call the API.
origins = [
    "http://localhost:3000",
    "https://cluster-buster.vercel.app",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Every router is mounted under the same /api prefix, in this order.
for api_router in (
    health_router,
    datasets_router,
    presets_router,
    experiments_router,
    compare_router,
    exports_router,
    runs_router,
):
    app.include_router(api_router, prefix="/api")
41
+
42
+
43
@app.get("/")
def root():
    """Return a small static payload identifying the service."""
    payload = {"ok": True, "service": "clusterbuster-api"}
    return payload
backend/app/models/experiment.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, Text
2
+ from backend.app.db import Base
3
+
4
+
5
class Experiment(Base):
    """ORM model for one clustering run, stored in the ``experiments`` table."""

    __tablename__ = "experiments"

    # Identity of the run and the dataset it was executed on.
    id = Column(String, primary_key=True, index=True)
    dataset_id = Column(String, nullable=False, index=True)

    # What was run and how it ended.
    algorithm = Column(String, nullable=False)
    status = Column(String, nullable=False)

    # JSON documents kept as text; readers decode them with json.loads.
    config_json = Column(Text, nullable=True)
    metrics_json = Column(Text, nullable=True)
    summary_json = Column(Text, nullable=True)

    # Duration in milliseconds and an optional failure description.
    runtime_ms = Column(Integer, nullable=True)
    error_message = Column(Text, nullable=True)
backend/app/repositories/experiment_repo.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy.orm import Session
2
+
3
+ from backend.app.models.experiment import Experiment
4
+
5
+
6
def create_experiment(
    db: Session,
    id: str,
    dataset_id: str,
    algorithm: str,
    status: str,
    config_json: str | None = None,
    metrics_json: str | None = None,
    summary_json: str | None = None,
    runtime_ms: int | None = None,
    error_message: str | None = None,
) -> Experiment:
    """Insert a new experiment row, commit it, and return the refreshed object.

    The ``*_json`` arguments are stored verbatim as text; callers are
    expected to pass already-serialized JSON strings.
    """
    record = Experiment(
        id=id,
        dataset_id=dataset_id,
        algorithm=algorithm,
        status=status,
        config_json=config_json,
        metrics_json=metrics_json,
        summary_json=summary_json,
        runtime_ms=runtime_ms,
        error_message=error_message,
    )

    db.add(record)
    db.commit()
    # Reload the row after the commit so the returned object is populated.
    db.refresh(record)

    return record
33
+
34
+
35
def get_experiment(db: Session, experiment_id: str) -> Experiment | None:
    """Return the experiment with the given id, or ``None`` if there is none."""
    matching = db.query(Experiment).filter(Experiment.id == experiment_id)
    return matching.first()
37
+
38
+
39
def list_experiments(db: Session) -> list[Experiment]:
    """Return every experiment, sorted by id in descending order."""
    # NOTE(review): ids look like random hex tokens ("exp_" + token_hex), so
    # this order is stable but not chronological - confirm that is intended.
    ordered = db.query(Experiment).order_by(Experiment.id.desc())
    return ordered.all()
41
+
42
+
43
def get_experiments_by_ids(db: Session, experiment_ids: list[str]) -> list[Experiment]:
    """Return the experiments whose id appears in ``experiment_ids``.

    An empty id list short-circuits to an empty result without querying.
    Ids that match no row are simply absent from the result.
    """
    if experiment_ids:
        id_filter = Experiment.id.in_(experiment_ids)
        return db.query(Experiment).filter(id_filter).all()
    return []
backend/app/services/profiling_service.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def profile_dataframe(df):
    """Summarise the columns of ``df`` and suggest clustering algorithms.

    Args:
        df: A pandas DataFrame.

    Returns:
        A dict with two keys:

        * ``"columns"``: one entry per column, in column order, holding its
          ``name``, ``inferred_type`` (the dtype as a string), ``missing_pct``
          (fraction of missing values between 0 and 1) and ``cardinality``
          (number of distinct non-missing values).
        * ``"recommended_algorithms"``: ``"kmeans"`` and ``"birch"`` when at
          least one numeric column exists, plus ``"agglomerative"`` when at
          least one object or categorical column exists.
    """
    # "number" matches every numeric width (int32, float32, ...), not only the
    # 64-bit defaults, so down-cast or imported columns are still recognised.
    numeric_cols = df.select_dtypes(include="number").columns
    # Columns converted to the pandas "category" dtype are categorical too.
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns

    recommended = []
    if len(numeric_cols) > 0:
        recommended.extend(["kmeans", "birch"])
    if len(categorical_cols) > 0:
        recommended.append("agglomerative")

    # Iterating items() yields one Series per column position, which keeps
    # working when two columns share the same label (df[label] would then
    # return a DataFrame and break the scalar conversions below).
    columns = [
        {
            "name": name,
            "inferred_type": str(series.dtype),
            "missing_pct": float(series.isna().mean()),
            "cardinality": int(series.nunique()),
        }
        for name, series in df.items()
    ]

    return {
        "columns": columns,
        "recommended_algorithms": recommended,
    }