# ClusterBuster-API/backend/app/api/datasets.py
# Adisri99's picture
# Upload 26 files
# 4013eed verified
import json
from pathlib import Path
import pandas as pd
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from sqlalchemy.orm import Session
from backend.app.db import get_db
from backend.app.repositories.dataset_repo import create_dataset, get_dataset, list_datasets
from backend.app.services.profiling_service import profile_dataframe
from backend.app.utils.ids import make_dataset_id
# Router collecting the dataset endpoints defined in this module; every route
# registered on it carries the "datasets" tag.
router = APIRouter(tags=["datasets"])
# Directory where upload_dataset stores the raw uploaded files.
UPLOAD_DIR = Path("/data/uploads")
# Runs at import time: create the directory (and any missing parents) and do
# nothing if it already exists.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
@router.get("/datasets")
def datasets_list(db: Session = Depends(get_db)):
    """List the datasets returned by ``list_datasets`` as summary entries.

    Returns:
        A dict with a single ``"datasets"`` key holding one entry per dataset,
        each exposing ``id``, ``name``, ``row_count`` and ``column_count``.
    """

    def summarize(record):
        # Expose only the summary attributes, not the whole record.
        return {
            "id": record.id,
            "name": record.name,
            "row_count": record.row_count,
            "column_count": record.column_count,
        }

    records = list_datasets(db)
    return {"datasets": [summarize(record) for record in records]}
def _read_table(path: Path, suffix: str) -> pd.DataFrame:
    """Parse the stored upload at *path*, mapping pandas parse errors to HTTP 400."""
    try:
        if suffix == ".csv":
            return pd.read_csv(path)
        return pd.read_excel(path)
    except ValueError as exc:
        # pandas' ParserError and EmptyDataError, as well as UnicodeDecodeError,
        # are ValueError subclasses: they indicate bad client input, not a
        # server fault.
        raise HTTPException(
            status_code=400,
            detail="Could not parse the uploaded file",
        ) from exc


@router.post("/datasets/upload")
async def upload_dataset(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Store an uploaded CSV/Excel file, profile it, and register it as a dataset.

    The raw bytes are written to ``UPLOAD_DIR`` under a freshly generated
    dataset id, parsed with pandas, profiled with ``profile_dataframe`` and
    recorded through ``create_dataset``.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count`` and ``column_count``.

    Raises:
        HTTPException: 400 when the file name is missing, the extension is not
            ``.csv``/``.xlsx``/``.xls``, the upload is empty, or pandas cannot
            parse the content.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing file name")
    suffix = Path(file.filename).suffix.lower()
    if suffix not in {".csv", ".xlsx", ".xls"}:
        raise HTTPException(status_code=400, detail="Only CSV and Excel files are supported")

    content = await file.read()
    if not content:
        # Reject before touching the disk; an empty file cannot be parsed.
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    dataset_id = make_dataset_id()
    path = UPLOAD_DIR / f"{dataset_id}{suffix}"
    # NOTE(review): write_bytes and the pandas readers are blocking calls inside
    # an async handler - consider offloading them if uploads can be large.
    path.write_bytes(content)

    try:
        df = _read_table(path, suffix)
        row_count = len(df)
        column_count = len(df.columns)
        profile = profile_dataframe(df)
        create_dataset(
            db=db,
            id=dataset_id,
            name=file.filename,
            file_path=str(path),
            row_count=row_count,
            column_count=column_count,
            schema_json=json.dumps({"columns": list(df.columns)}),
            profile_json=json.dumps(profile),
        )
    except Exception:
        # Nothing references the stored file unless registration succeeded, so
        # remove it rather than leaving an orphan, then let the error propagate.
        path.unlink(missing_ok=True)
        raise

    return {
        "dataset_id": dataset_id,
        "name": file.filename,
        "row_count": row_count,
        "column_count": column_count,
    }
@router.get("/datasets/{dataset_id}/profile")
def dataset_profile(dataset_id: str, db: Session = Depends(get_db)):
    """Return the stored schema and profile for one dataset.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count``, ``column_count``
        and the decoded ``schema`` and ``profile`` JSON documents.

    Raises:
        HTTPException: 404 when ``get_dataset`` yields no dataset for the id.
    """
    record = get_dataset(db, dataset_id)
    if record:
        # Both documents are stored as JSON text and decoded on the way out.
        schema = json.loads(record.schema_json)
        profile = json.loads(record.profile_json)
        return {
            "dataset_id": record.id,
            "name": record.name,
            "row_count": record.row_count,
            "column_count": record.column_count,
            "schema": schema,
            "profile": profile,
        }
    raise HTTPException(status_code=404, detail="Dataset not found")