Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- backend/app/api/compare.py +41 -0
- backend/app/api/experiments.py +104 -0
- backend/app/api/exports.py +50 -0
- backend/app/api/runs.py +29 -0
- backend/app/db.py +34 -0
- backend/app/main.py +45 -0
- backend/app/models/experiment.py +16 -0
- backend/app/repositories/experiment_repo.py +46 -0
- backend/app/services/profiling_service.py +25 -0
backend/app/api/compare.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends, Query
|
| 4 |
+
from sqlalchemy.orm import Session
|
| 5 |
+
|
| 6 |
+
from backend.app.db import get_db
|
| 7 |
+
from backend.app.repositories.experiment_repo import (
|
| 8 |
+
get_experiments_by_ids,
|
| 9 |
+
list_experiments,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
router = APIRouter(tags=["compare"])


@router.get("/compare")
def compare_experiments(
    experiment_ids: str | None = Query(default=None),
    db: Session = Depends(get_db),
):
    """Return stored experiments in a single payload for comparison.

    ``experiment_ids`` is an optional comma-separated list of experiment ids.
    When it is missing or empty, every stored experiment is returned;
    otherwise only the listed ids are looked up (blank entries are ignored).

    Returns:
        ``{"experiments": [...]}`` with one dict per experiment row; the JSON
        text columns are decoded into objects.
    """

    def _decode(raw):
        # The JSON columns are nullable; a missing/empty value becomes {}.
        return json.loads(raw) if raw else {}

    if not experiment_ids:
        experiments = list_experiments(db)
    else:
        pieces = [piece.strip() for piece in experiment_ids.split(",")]
        experiments = get_experiments_by_ids(db, [p for p in pieces if p])

    payload = []
    for exp in experiments:
        payload.append(
            {
                "experiment_id": exp.id,
                "dataset_id": exp.dataset_id,
                "algorithm": exp.algorithm,
                "status": exp.status,
                "config": _decode(exp.config_json),
                "metrics": _decode(exp.metrics_json),
                "summary": _decode(exp.summary_json),
                "runtime_ms": exp.runtime_ms,
                "error_message": exp.error_message,
            }
        )

    return {"experiments": payload}
|
backend/app/api/experiments.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import secrets
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
from sqlalchemy.orm import Session
|
| 8 |
+
from sklearn.cluster import AgglomerativeClustering, Birch, KMeans
|
| 9 |
+
from sklearn.metrics import silhouette_score
|
| 10 |
+
|
| 11 |
+
from backend.app.db import get_db
|
| 12 |
+
from backend.app.repositories.dataset_repo import get_dataset
|
| 13 |
+
from backend.app.repositories.experiment_repo import create_experiment
|
| 14 |
+
|
| 15 |
+
router = APIRouter(tags=["experiments"])


class RunRequest(BaseModel):
    """Request body accepted by ``POST /experiments/run``."""

    # Id of the stored dataset to cluster.
    dataset_id: str

    # Optional label for the run; defaults to None when omitted.
    name: str | None = None

    # Clustering algorithm name; the run handler in this file accepts
    # "kmeans", "agglomerative" and "birch".
    algorithm: str = "kmeans"

    # Number of clusters passed to the chosen estimator.
    n_clusters: int = 4

    # Dataset columns used as clustering features (required field).
    feature_columns: list[str]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@router.post("/experiments/run")
def run_experiment(req: RunRequest, db: Session = Depends(get_db)):
    """Cluster the requested dataset columns, persist the run and return it.

    Steps: load the dataset file (CSV, otherwise treated as Excel), one-hot
    encode the selected feature columns, fill missing values with 0, fit the
    requested estimator, compute a silhouette score when it is defined, and
    store the experiment row with status ``"completed"``.

    Args:
        req: Validated request body (dataset id, algorithm, cluster count,
            feature columns).
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A dict with the new experiment id, status, per-row cluster labels,
        silhouette score (or ``None``), cluster count, cluster sizes and the
        fit runtime in milliseconds.

    Raises:
        HTTPException: 404 when the dataset id is unknown; 400 when
            ``feature_columns`` is empty or names missing columns, when the
            dataset has no rows, when ``n_clusters`` is out of range, when the
            algorithm is unsupported, or when the estimator rejects the input.
    """
    # Local import: this module's import block does not provide ``json``.
    import json

    dataset = get_dataset(db, req.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Compare the extension case-insensitively so "DATA.CSV" is not sent to
    # the Excel reader.
    if dataset.file_path.lower().endswith(".csv"):
        df = pd.read_csv(dataset.file_path)
    else:
        df = pd.read_excel(dataset.file_path)

    if not req.feature_columns:
        raise HTTPException(status_code=400, detail="feature_columns is required")

    missing = [c for c in req.feature_columns if c not in df.columns]
    if missing:
        raise HTTPException(status_code=400, detail=f"Missing columns: {', '.join(missing)}")

    # One-hot encode non-numeric columns, then replace remaining NaNs with 0.
    X = pd.get_dummies(df[req.feature_columns]).fillna(0)

    # Reject inputs the estimators cannot fit instead of letting the
    # resulting ValueError escape as an HTTP 500.
    if len(X) == 0:
        raise HTTPException(status_code=400, detail="Dataset has no rows")
    if not 1 <= req.n_clusters <= len(X):
        raise HTTPException(
            status_code=400,
            detail=f"n_clusters must be between 1 and the number of rows ({len(X)})",
        )

    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump if the wall clock is adjusted mid-run.
    start = time.perf_counter()

    if req.algorithm == "kmeans":
        model = KMeans(n_clusters=req.n_clusters, n_init=10, random_state=42)
    elif req.algorithm == "agglomerative":
        model = AgglomerativeClustering(n_clusters=req.n_clusters)
    elif req.algorithm == "birch":
        model = Birch(n_clusters=req.n_clusters)
    else:
        raise HTTPException(status_code=400, detail="Unsupported algorithm")

    try:
        labels = model.fit_predict(X)
    except ValueError as exc:
        # Estimator-level validation failure (bad input for this algorithm).
        raise HTTPException(status_code=400, detail=f"Clustering failed: {exc}") from exc

    runtime_ms = int((time.perf_counter() - start) * 1000)

    unique_labels = sorted(set(labels.tolist()))

    # The silhouette score needs at least 2 clusters and fewer clusters than
    # samples; otherwise it is reported as None.
    score = None
    if 1 < len(unique_labels) < len(X):
        score = float(silhouette_score(X, labels))

    cluster_sizes = {str(label): int((labels == label).sum()) for label in unique_labels}
    experiment_id = "exp_" + secrets.token_hex(4)

    metrics = {
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "row_count": int(len(X)),
        "runtime_ms": runtime_ms,
    }
    summary = {
        "feature_columns": req.feature_columns,
        "cluster_sizes": cluster_sizes,
    }

    create_experiment(
        db=db,
        id=experiment_id,
        dataset_id=req.dataset_id,
        algorithm=req.algorithm,
        status="completed",
        config_json=req.model_dump_json(),
        # json.dumps keeps integers as integers; routing the dict through a
        # pandas Series coerced the counts to floats (e.g. 4 -> 4.0).
        metrics_json=json.dumps(metrics),
        summary_json=json.dumps(summary),
        runtime_ms=runtime_ms,
        error_message=None,
    )

    return {
        "experiment_id": experiment_id,
        "status": "completed",
        "clusters": labels.tolist(),
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "cluster_sizes": cluster_sizes,
        "runtime_ms": runtime_ms,
    }
|
backend/app/api/exports.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 6 |
+
from fastapi.responses import StreamingResponse
|
| 7 |
+
from sqlalchemy.orm import Session
|
| 8 |
+
|
| 9 |
+
from backend.app.db import get_db
|
| 10 |
+
from backend.app.repositories.dataset_repo import get_dataset
|
| 11 |
+
from backend.app.repositories.experiment_repo import get_experiment
|
| 12 |
+
|
| 13 |
+
router = APIRouter(tags=["exports"])


@router.get("/exports/{experiment_id}")
def export_experiment(experiment_id: str, db: Session = Depends(get_db)):
    """Download an experiment's dataset as CSV, annotated with run metadata.

    Every row of the dataset gets the same extra columns: the experiment id,
    algorithm and status, plus one ``metric_<name>`` column per stored metric.

    Raises:
        HTTPException: 404 when the experiment or its dataset is not found.
    """
    experiment = get_experiment(db, experiment_id)
    if not experiment:
        raise HTTPException(status_code=404, detail="Experiment not found")

    dataset = get_dataset(db, experiment.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Anything that is not a ".csv" path is handed to the Excel reader.
    source_path = dataset.file_path
    loader = pd.read_csv if source_path.endswith(".csv") else pd.read_excel
    export_df = loader(source_path).copy()

    # Constant-valued columns describing the run.
    for column, value in (
        ("experiment_id", experiment.id),
        ("algorithm", experiment.algorithm),
        ("experiment_status", experiment.status),
    ):
        export_df[column] = value

    stored_metrics = json.loads(experiment.metrics_json) if experiment.metrics_json else {}
    for metric_name, metric_value in stored_metrics.items():
        export_df[f"metric_{metric_name}"] = metric_value

    csv_buffer = io.StringIO()
    export_df.to_csv(csv_buffer, index=False)
    csv_text = csv_buffer.getvalue()

    disposition = f"attachment; filename={experiment_id}_export.csv"
    return StreamingResponse(
        iter([csv_text]),
        media_type="text/csv",
        headers={"Content-Disposition": disposition},
    )
|
backend/app/api/runs.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
from sqlalchemy.orm import Session
|
| 5 |
+
|
| 6 |
+
from backend.app.db import get_db
|
| 7 |
+
from backend.app.repositories.experiment_repo import list_experiments
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["runs"])


@router.get("/runs")
def get_runs(db: Session = Depends(get_db)):
    """List every stored experiment run.

    Returns:
        ``{"runs": [...]}`` with one dict per experiment; the metrics and
        summary JSON columns are decoded, and empty values become ``{}``.
    """

    def _decode(raw):
        # Nullable JSON text column -> dict (or {} when unset/empty).
        return json.loads(raw) if raw else {}

    runs = []
    for exp in list_experiments(db):
        runs.append(
            {
                "experiment_id": exp.id,
                "dataset_id": exp.dataset_id,
                "algorithm": exp.algorithm,
                "status": exp.status,
                "metrics": _decode(exp.metrics_json),
                "summary": _decode(exp.summary_json),
                "runtime_ms": exp.runtime_ms,
                "error_message": exp.error_message,
            }
        )

    return {"runs": runs}
|
backend/app/db.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from sqlalchemy.orm import declarative_base, sessionmaker
|
| 4 |
+
|
| 5 |
+
# Path of the SQLite database file; override it with the DB_PATH environment
# variable.
DB_PATH = os.environ.get("DB_PATH", "/data/clusterforge.db")
DATABASE_URL = f"sqlite:///{DB_PATH}"

# check_same_thread=False turns off sqlite3's same-thread check so that a
# connection may be used from a thread other than the one that created it.
engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})

# Session factory bound to the engine; sessions neither autocommit nor
# autoflush, so callers commit explicitly.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class that the ORM models inherit from.
Base = declarative_base()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def init_db() -> None:
    """Create the tables for all models registered on ``Base``."""
    # Imported only for their side effect: importing a model module defines
    # its mapped class, which registers the table on Base.metadata. They are
    # imported here, not at module top, because the experiment model module
    # itself imports from this module.
    import backend.app.models.dataset  # noqa: F401
    import backend.app.models.experiment  # noqa: F401

    Base.metadata.create_all(engine)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_db():
    """Yield a database session and close it once the consumer is done.

    Written as a generator so it can be used as a FastAPI dependency: the
    session is handed to the request handler and closed afterwards, even if
    the handler raised.
    """
    # The session's context manager closes it on exit, like try/finally.
    with SessionLocal() as session:
        yield session
|
backend/app/main.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from backend.app.db import init_db
from backend.app.api.health import router as health_router
from backend.app.api.datasets import router as datasets_router
from backend.app.api.presets import router as presets_router
from backend.app.api.experiments import router as experiments_router
from backend.app.api.compare import router as compare_router
from backend.app.api.exports import router as exports_router
from backend.app.api.runs import router as runs_router
|
| 12 |
+
|
| 13 |
+
def on_startup() -> None:
    """Prepare the database schema before the API starts serving requests."""
    init_db()


@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan handler: run startup work, then serve.

    Replaces the deprecated ``@app.on_event("startup")`` hook. Code before
    ``yield`` runs once at startup; nothing needs to run at shutdown.

    NOTE(review): once a ``lifespan`` handler is set, ``on_event``
    startup/shutdown handlers are no longer called; confirm that no included
    router relies on them.
    """
    on_startup()
    yield


app = FastAPI(title="ClusterBuster API", lifespan=lifespan)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Browser origins that are allowed to make cross-origin requests to the API.
origins = [
    "http://localhost:3000",
    "https://cluster-buster.vercel.app",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Every router is mounted under the same "/api" prefix, in this order.
for _api_router in (
    health_router,
    datasets_router,
    presets_router,
    experiments_router,
    compare_router,
    exports_router,
    runs_router,
):
    app.include_router(_api_router, prefix="/api")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.get("/")
def root():
    """Return a small static payload identifying the service."""
    service_info = {"ok": True, "service": "clusterbuster-api"}
    return service_info
|
backend/app/models/experiment.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, Text
|
| 2 |
+
from backend.app.db import Base
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Experiment(Base):
    """ORM model for one clustering run, stored in the ``experiments`` table."""

    __tablename__ = "experiments"

    # String primary key, supplied by the caller when the row is created.
    id = Column(String, primary_key=True, index=True)

    # Id of the dataset the run was executed on (indexed, required).
    dataset_id = Column(String, nullable=False, index=True)

    # Name of the clustering algorithm used (required).
    algorithm = Column(String, nullable=False)

    # Run status string (required).
    status = Column(String, nullable=False)

    # JSON documents stored as text; each may be NULL.
    config_json = Column(Text, nullable=True)
    metrics_json = Column(Text, nullable=True)
    summary_json = Column(Text, nullable=True)

    # Runtime in milliseconds; may be NULL.
    runtime_ms = Column(Integer, nullable=True)

    # Error description; may be NULL.
    error_message = Column(Text, nullable=True)
|
backend/app/repositories/experiment_repo.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy.orm import Session
|
| 2 |
+
|
| 3 |
+
from backend.app.models.experiment import Experiment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def create_experiment(
    db: Session,
    id: str,
    dataset_id: str,
    algorithm: str,
    status: str,
    config_json: str | None = None,
    metrics_json: str | None = None,
    summary_json: str | None = None,
    runtime_ms: int | None = None,
    error_message: str | None = None,
) -> Experiment:
    """Insert a new experiment row, commit it and return the stored object.

    Args:
        db: Open database session; the insert is committed on it.
        id: Primary key for the new row.
        dataset_id: Id of the dataset the run belongs to.
        algorithm: Name of the algorithm that was run.
        status: Run status string.
        config_json: Optional JSON text for the run configuration.
        metrics_json: Optional JSON text for the run metrics.
        summary_json: Optional JSON text for the run summary.
        runtime_ms: Optional runtime in milliseconds.
        error_message: Optional error description.

    Returns:
        The persisted ``Experiment``, refreshed from the database.
    """
    column_values = {
        "id": id,
        "dataset_id": dataset_id,
        "algorithm": algorithm,
        "status": status,
        "config_json": config_json,
        "metrics_json": metrics_json,
        "summary_json": summary_json,
        "runtime_ms": runtime_ms,
        "error_message": error_message,
    }
    record = Experiment(**column_values)

    db.add(record)
    db.commit()
    # Reload the row so the returned object reflects what was stored.
    db.refresh(record)
    return record
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_experiment(db: Session, experiment_id: str) -> Experiment | None:
    """Return the experiment with the given id, or ``None`` if there is none."""
    lookup = db.query(Experiment).filter_by(id=experiment_id)
    return lookup.first()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def list_experiments(db: Session) -> list[Experiment]:
    """Return every stored experiment, ordered by id in descending order."""
    ordering = Experiment.id.desc()
    all_rows = db.query(Experiment).order_by(ordering)
    return all_rows.all()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_experiments_by_ids(db: Session, experiment_ids: list[str]) -> list[Experiment]:
    """Return the experiments whose id appears in ``experiment_ids``.

    An empty id list short-circuits to an empty result without querying.
    No ordering is applied to the result.
    """
    if experiment_ids:
        matching = db.query(Experiment).filter(Experiment.id.in_(experiment_ids))
        return matching.all()
    return []
|
backend/app/services/profiling_service.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def profile_dataframe(df):
    """Summarise each column of ``df`` and suggest clustering algorithms.

    Args:
        df: A pandas DataFrame to profile.

    Returns:
        A dict with two keys:

        * ``"columns"``: one dict per column with its ``name``, the dtype as
          a string (``inferred_type``), the fraction of missing values in the
          range 0-1 (``missing_pct``) and the number of distinct non-null
          values (``cardinality``).
        * ``"recommended_algorithms"``: ``"kmeans"`` and ``"birch"`` when any
          numeric column exists, plus ``"agglomerative"`` when any
          object/category column exists.
    """
    # "number" matches every numeric dtype (int32, float32, unsigned, ...),
    # not just int64/float64. timedelta64 is a NumPy signed-integer subtype,
    # so it is excluded explicitly to keep the numeric-only meaning.
    numeric_cols = df.select_dtypes(
        include="number", exclude=["timedelta"]
    ).columns.tolist()
    categorical_cols = df.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()

    recommended = []
    if numeric_cols:
        recommended.extend(["kmeans", "birch"])
    if categorical_cols:
        recommended.append("agglomerative")

    # The mean of an empty boolean Series is NaN, which is not valid JSON;
    # report 0.0 missing for a frame without rows instead.
    has_rows = len(df) > 0

    columns = []
    for col in df.columns:
        series = df[col]
        columns.append(
            {
                "name": col,
                "inferred_type": str(series.dtype),
                "missing_pct": float(series.isna().mean()) if has_rows else 0.0,
                "cardinality": int(series.nunique()),
            }
        )

    return {
        "columns": columns,
        "recommended_algorithms": recommended,
    }
|