Spaces:
Sleeping
Sleeping
Delete backend
Browse files- backend/app/api/compare.py +0 -41
- backend/app/api/experiments.py +0 -104
- backend/app/api/exports.py +0 -50
- backend/app/api/runs.py +0 -29
- backend/app/db.py +0 -34
- backend/app/main.py +0 -45
- backend/app/models/experiment.py +0 -16
- backend/app/repositories/experiment_repo.py +0 -46
- backend/app/services/profiling_service.py +0 -25
backend/app/api/compare.py
DELETED
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
|
| 3 |
-
from fastapi import APIRouter, Depends, Query
|
| 4 |
-
from sqlalchemy.orm import Session
|
| 5 |
-
|
| 6 |
-
from backend.app.db import get_db
|
| 7 |
-
from backend.app.repositories.experiment_repo import (
|
| 8 |
-
get_experiments_by_ids,
|
| 9 |
-
list_experiments,
|
| 10 |
-
)
|
| 11 |
-
|
| 12 |
-
router = APIRouter(tags=["compare"])


def _decode_json_column(raw):
    """Parse a stored JSON text column; NULL or empty text becomes ``{}``."""
    return json.loads(raw) if raw else {}


@router.get("/compare")
def compare_experiments(
    experiment_ids: str | None = Query(default=None),
    db: Session = Depends(get_db),
):
    """Return stored experiments side by side for comparison.

    ``experiment_ids`` is an optional comma-separated list of ids. When it is
    absent or empty, every stored experiment is returned; otherwise only the
    experiments whose ids appear in the list are looked up.
    """
    if not experiment_ids:
        selected = list_experiments(db)
    else:
        # Tolerate stray whitespace and empty segments such as "a, ,b,".
        stripped = (piece.strip() for piece in experiment_ids.split(","))
        wanted = [piece for piece in stripped if piece]
        selected = get_experiments_by_ids(db, wanted)

    payload = []
    for exp in selected:
        payload.append(
            {
                "experiment_id": exp.id,
                "dataset_id": exp.dataset_id,
                "algorithm": exp.algorithm,
                "status": exp.status,
                "config": _decode_json_column(exp.config_json),
                "metrics": _decode_json_column(exp.metrics_json),
                "summary": _decode_json_column(exp.summary_json),
                "runtime_ms": exp.runtime_ms,
                "error_message": exp.error_message,
            }
        )
    return {"experiments": payload}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/experiments.py
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
import secrets
|
| 2 |
-
import time
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
-
from fastapi import APIRouter, Depends, HTTPException
|
| 6 |
-
from pydantic import BaseModel
|
| 7 |
-
from sqlalchemy.orm import Session
|
| 8 |
-
from sklearn.cluster import AgglomerativeClustering, Birch, KMeans
|
| 9 |
-
from sklearn.metrics import silhouette_score
|
| 10 |
-
|
| 11 |
-
from backend.app.db import get_db
|
| 12 |
-
from backend.app.repositories.dataset_repo import get_dataset
|
| 13 |
-
from backend.app.repositories.experiment_repo import create_experiment
|
| 14 |
-
|
| 15 |
-
router = APIRouter(tags=["experiments"])


class RunRequest(BaseModel):
    """Request body accepted by ``POST /experiments/run``."""

    # Id of the stored dataset to cluster.
    dataset_id: str
    # Optional label for the run.
    name: str | None = None
    # One of the algorithm keys handled by the run endpoint; defaults to k-means.
    algorithm: str = "kmeans"
    # Number of clusters passed to the chosen estimator.
    n_clusters: int = 4
    # Dataset columns used as clustering features (required field).
    feature_columns: list[str]
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
@router.post("/experiments/run")
def run_experiment(req: RunRequest, db: Session = Depends(get_db)):
    """Cluster the requested columns of a stored dataset and persist the run.

    Args:
        req: Dataset id, algorithm key, cluster count and feature columns.
        db: SQLAlchemy session injected by FastAPI.

    Returns:
        A dict with the new experiment id, the per-row cluster labels, the
        silhouette score (``None`` when it is undefined), the cluster count,
        the per-cluster sizes and the fit runtime in milliseconds.

    Raises:
        HTTPException: 404 when the dataset does not exist; 400 when
            ``feature_columns`` is empty, names unknown columns, the algorithm
            is unsupported, or the estimator rejects the input.
    """
    # Local import: this module's import header does not bring in ``json``.
    import json

    dataset = get_dataset(db, req.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Reject an empty feature list before paying for any file I/O.
    if not req.feature_columns:
        raise HTTPException(status_code=400, detail="feature_columns is required")

    if dataset.file_path.endswith(".csv"):
        df = pd.read_csv(dataset.file_path)
    else:
        df = pd.read_excel(dataset.file_path)

    missing = [c for c in req.feature_columns if c not in df.columns]
    if missing:
        raise HTTPException(status_code=400, detail=f"Missing columns: {', '.join(missing)}")

    # One-hot encode non-numeric features and treat missing values as zero.
    X = pd.get_dummies(df[req.feature_columns]).fillna(0)

    estimator_factories = {
        "kmeans": lambda: KMeans(n_clusters=req.n_clusters, n_init=10, random_state=42),
        "agglomerative": lambda: AgglomerativeClustering(n_clusters=req.n_clusters),
        "birch": lambda: Birch(n_clusters=req.n_clusters),
    }
    make_estimator = estimator_factories.get(req.algorithm)
    if make_estimator is None:
        raise HTTPException(status_code=400, detail="Unsupported algorithm")

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the wall clock is adjusted during the fit.
    start = time.perf_counter()
    try:
        labels = make_estimator().fit_predict(X)
    except ValueError as exc:
        # The estimator rejected the request (e.g. an invalid n_clusters or
        # too few rows); report it as a client error instead of a 500.
        raise HTTPException(status_code=400, detail=f"Clustering failed: {exc}") from exc
    runtime_ms = int((time.perf_counter() - start) * 1000)

    unique_labels = sorted(set(labels.tolist()))
    score = None
    # The silhouette score is only computed when there are at least two
    # clusters and fewer clusters than rows.
    if 1 < len(unique_labels) < len(X):
        score = float(silhouette_score(X, labels))

    cluster_sizes = {str(label): int((labels == label).sum()) for label in unique_labels}
    experiment_id = "exp_" + secrets.token_hex(4)

    metrics = {
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "row_count": int(len(X)),
        "runtime_ms": runtime_ms,
    }
    summary = {
        "feature_columns": req.feature_columns,
        "cluster_sizes": cluster_sizes,
    }

    create_experiment(
        db=db,
        id=experiment_id,
        dataset_id=req.dataset_id,
        algorithm=req.algorithm,
        status="completed",
        config_json=req.model_dump_json(),
        # json.dumps keeps the integer counters as integers; routing the dict
        # through a pandas Series would upcast them to floats whenever the
        # silhouette score is a float.
        metrics_json=json.dumps(metrics),
        summary_json=json.dumps(summary),
        runtime_ms=runtime_ms,
        error_message=None,
    )

    return {
        "experiment_id": experiment_id,
        "status": "completed",
        "clusters": labels.tolist(),
        "silhouette_score": score,
        "cluster_count": len(unique_labels),
        "cluster_sizes": cluster_sizes,
        "runtime_ms": runtime_ms,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/exports.py
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
import io
|
| 2 |
-
import json
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
-
from fastapi import APIRouter, Depends, HTTPException
|
| 6 |
-
from fastapi.responses import StreamingResponse
|
| 7 |
-
from sqlalchemy.orm import Session
|
| 8 |
-
|
| 9 |
-
from backend.app.db import get_db
|
| 10 |
-
from backend.app.repositories.dataset_repo import get_dataset
|
| 11 |
-
from backend.app.repositories.experiment_repo import get_experiment
|
| 12 |
-
|
| 13 |
-
router = APIRouter(tags=["exports"])


@router.get("/exports/{experiment_id}")
def export_experiment(experiment_id: str, db: Session = Depends(get_db)):
    """Download an experiment's dataset as CSV, annotated with run metadata.

    Every row of the source dataset is repeated unchanged, followed by
    constant columns holding the experiment id, algorithm, status and one
    ``metric_<name>`` column per stored metric.

    Raises:
        HTTPException: 404 when the experiment or its dataset is not found.
    """
    experiment = get_experiment(db, experiment_id)
    if not experiment:
        raise HTTPException(status_code=404, detail="Experiment not found")

    dataset = get_dataset(db, experiment.dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Anything that is not a .csv path is read as an Excel workbook.
    is_csv = dataset.file_path.endswith(".csv")
    source = pd.read_csv(dataset.file_path) if is_csv else pd.read_excel(dataset.file_path)

    annotated = source.copy()
    annotated["experiment_id"] = experiment.id
    annotated["algorithm"] = experiment.algorithm
    annotated["experiment_status"] = experiment.status

    stored_metrics = json.loads(experiment.metrics_json) if experiment.metrics_json else {}
    for metric_name in stored_metrics:
        annotated[f"metric_{metric_name}"] = stored_metrics[metric_name]

    # With no target given, to_csv returns the whole document as one string.
    csv_text = annotated.to_csv(index=False)

    disposition = f"attachment; filename={experiment_id}_export.csv"
    return StreamingResponse(
        iter([csv_text]),
        media_type="text/csv",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/runs.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
|
| 3 |
-
from fastapi import APIRouter, Depends
|
| 4 |
-
from sqlalchemy.orm import Session
|
| 5 |
-
|
| 6 |
-
from backend.app.db import get_db
|
| 7 |
-
from backend.app.repositories.experiment_repo import list_experiments
|
| 8 |
-
|
| 9 |
-
router = APIRouter(tags=["runs"])


@router.get("/runs")
def get_runs(db: Session = Depends(get_db)):
    """List every stored experiment run with its decoded metrics and summary."""

    def as_run(exp):
        # JSON columns may be NULL/empty; expose those as empty dicts.
        metrics = json.loads(exp.metrics_json) if exp.metrics_json else {}
        summary = json.loads(exp.summary_json) if exp.summary_json else {}
        return {
            "experiment_id": exp.id,
            "dataset_id": exp.dataset_id,
            "algorithm": exp.algorithm,
            "status": exp.status,
            "metrics": metrics,
            "summary": summary,
            "runtime_ms": exp.runtime_ms,
            "error_message": exp.error_message,
        }

    return {"runs": list(map(as_run, list_experiments(db)))}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/db.py
DELETED
|
@@ -1,34 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from sqlalchemy import create_engine
|
| 3 |
-
from sqlalchemy.orm import declarative_base, sessionmaker
|
| 4 |
-
|
| 5 |
-
# Location of the SQLite file; override with the DB_PATH environment variable.
DB_PATH = os.getenv("DB_PATH", "/data/clusterforge.db")
DATABASE_URL = "sqlite:///" + DB_PATH

# check_same_thread=False lifts sqlite3's restriction that a connection may
# only be used on the thread that created it.
_connect_args = {"check_same_thread": False}
engine = create_engine(DATABASE_URL, connect_args=_connect_args)

# Session factory bound to the engine; commits and flushes are explicit.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class that the ORM models inherit from.
Base = declarative_base()
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def init_db() -> None:
    """Create all tables for the models registered on ``Base``."""
    # Imported only for their side effect: importing the model modules
    # registers their tables on Base.metadata before create_all runs.
    from backend.app.models import dataset  # noqa: F401
    from backend.app.models import experiment  # noqa: F401

    Base.metadata.create_all(bind=engine)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def get_db():
    """Yield a database session and close it once the consumer is done.

    Written as a generator so it can serve as a FastAPI dependency; the
    ``finally`` clause closes the session even if the consumer raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/main.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
|
| 4 |
-
from backend.app.db import init_db
|
| 5 |
-
from backend.app.api.health import router as health_router
|
| 6 |
-
from backend.app.api.datasets import router as datasets_router
|
| 7 |
-
from backend.app.api.presets import router as presets_router
|
| 8 |
-
from backend.app.api.experiments import router as experiments_router
|
| 9 |
-
from backend.app.api.compare import router as compare_router
|
| 10 |
-
from backend.app.api.exports import router as exports_router
|
| 11 |
-
from backend.app.api.runs import router as runs_router
|
| 12 |
-
|
| 13 |
-
app = FastAPI(title="ClusterBuster API")


@app.on_event("startup")
def on_startup() -> None:
    """Initialise the database once when the application starts."""
    init_db()
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
# Browser origins allowed to call the API: the local dev server and the
# deployed frontend.
origins = [
    "http://localhost:3000",
    "https://cluster-buster.vercel.app",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount every feature router under the shared /api prefix, in the same order
# as before.
for _api_router in (
    health_router,
    datasets_router,
    presets_router,
    experiments_router,
    compare_router,
    exports_router,
    runs_router,
):
    app.include_router(_api_router, prefix="/api")
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
@app.get("/")
def root():
    """Return a small static payload identifying the service."""
    return dict(ok=True, service="clusterbuster-api")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/models/experiment.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from sqlalchemy import Column, Integer, String, Text
|
| 2 |
-
from backend.app.db import Base
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Experiment(Base):
    """ORM model for one clustering run, stored in the ``experiments`` table."""

    __tablename__ = "experiments"

    # Identity and the dataset the run was executed against.
    id = Column(String, index=True, primary_key=True)
    dataset_id = Column(String, index=True, nullable=False)

    # What was run and how it ended.
    algorithm = Column(String, nullable=False)
    status = Column(String, nullable=False)

    # JSON documents serialized to text; all optional.
    config_json = Column(Text, nullable=True)
    metrics_json = Column(Text, nullable=True)
    summary_json = Column(Text, nullable=True)

    # Run duration in milliseconds and an optional failure description.
    runtime_ms = Column(Integer, nullable=True)
    error_message = Column(Text, nullable=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/repositories/experiment_repo.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
from sqlalchemy.orm import Session
|
| 2 |
-
|
| 3 |
-
from backend.app.models.experiment import Experiment
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def create_experiment(
    db: Session,
    id: str,
    dataset_id: str,
    algorithm: str,
    status: str,
    config_json: str | None = None,
    metrics_json: str | None = None,
    summary_json: str | None = None,
    runtime_ms: int | None = None,
    error_message: str | None = None,
) -> Experiment:
    """Insert a new experiment row, commit it and return the refreshed object.

    The JSON arguments are stored as given (already-serialized text).
    """
    column_values = {
        "id": id,
        "dataset_id": dataset_id,
        "algorithm": algorithm,
        "status": status,
        "config_json": config_json,
        "metrics_json": metrics_json,
        "summary_json": summary_json,
        "runtime_ms": runtime_ms,
        "error_message": error_message,
    }
    record = Experiment(**column_values)

    db.add(record)
    db.commit()
    # Reload the row so the returned object reflects what the database holds.
    db.refresh(record)
    return record
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def get_experiment(db: Session, experiment_id: str) -> Experiment | None:
    """Return the experiment with the given id, or ``None`` when absent."""
    matches = db.query(Experiment).filter(Experiment.id == experiment_id)
    return matches.first()
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def list_experiments(db: Session) -> list[Experiment]:
    """Return every experiment, ordered by id in descending order."""
    newest_id_first = Experiment.id.desc()
    return db.query(Experiment).order_by(newest_id_first).all()
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def get_experiments_by_ids(db: Session, experiment_ids: list[str]) -> list[Experiment]:
    """Return the experiments whose id is in ``experiment_ids``.

    An empty id list short-circuits to an empty result without querying.
    """
    if experiment_ids:
        id_filter = Experiment.id.in_(experiment_ids)
        return db.query(Experiment).filter(id_filter).all()
    return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/profiling_service.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
def profile_dataframe(df):
    """Summarise each column of *df* and suggest clustering algorithms.

    Args:
        df: The pandas DataFrame to profile.

    Returns:
        A dict with two keys:

        * ``"columns"``: one entry per column with its ``name``, its dtype
          rendered as a string (``inferred_type``), the fraction of missing
          values (``missing_pct``, ``0.0`` for a frame with no rows) and the
          number of distinct non-null values (``cardinality``).
        * ``"recommended_algorithms"``: ``"kmeans"`` and ``"birch"`` when any
          numeric column exists, plus ``"agglomerative"`` when any
          object/category column exists.
    """
    # "number" matches every numeric width (int32, float32, unsigned, ...),
    # not only the 64-bit defaults.
    numeric_cols = df.select_dtypes(include="number").columns
    # Dedicated categorical columns count as categorical alongside object ones.
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns

    recommended = []
    if len(numeric_cols) > 0:
        recommended.extend(["kmeans", "birch"])
    if len(categorical_cols) > 0:
        recommended.append("agglomerative")

    # The mean of an empty column is NaN, which is not valid JSON; report 0.0.
    has_rows = len(df) > 0

    return {
        "columns": [
            {
                "name": col,
                "inferred_type": str(df[col].dtype),
                "missing_pct": float(df[col].isna().mean()) if has_rows else 0.0,
                "cardinality": int(df[col].nunique()),
            }
            for col in df.columns
        ],
        "recommended_algorithms": recommended,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|