import json
from pathlib import Path

import pandas as pd
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from sqlalchemy.orm import Session

from backend.app.db import get_db
from backend.app.repositories.dataset_repo import create_dataset, get_dataset, list_datasets
from backend.app.services.profiling_service import profile_dataframe
from backend.app.utils.ids import make_dataset_id

router = APIRouter(tags=["datasets"])

# Uploaded files are stored here under "<dataset_id><suffix>"; the directory
# is created at import time so the upload endpoint can write immediately.
UPLOAD_DIR = Path("/data/uploads")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# File extensions (lower-cased) accepted by the upload endpoint.
ALLOWED_SUFFIXES = frozenset({".csv", ".xlsx", ".xls"})


@router.get("/datasets")
def datasets_list(db: Session = Depends(get_db)):
    """Return a summary (id, name, row and column counts) of every dataset.

    The records come from ``list_datasets(db)``; the response is a dict with
    a single ``"datasets"`` key holding one summary dict per record.
    """
    return {
        "datasets": [
            {
                "id": d.id,
                "name": d.name,
                "row_count": d.row_count,
                "column_count": d.column_count,
            }
            for d in list_datasets(db)
        ]
    }


@router.post("/datasets/upload")
async def upload_dataset(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Store an uploaded CSV/Excel file, profile it and register it as a dataset.

    The file is written to ``UPLOAD_DIR`` under a freshly generated dataset
    id, loaded with pandas, profiled with ``profile_dataframe`` and persisted
    through ``create_dataset``.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count`` and
        ``column_count`` for the new dataset.

    Raises:
        HTTPException: 400 when the upload has no file name, has an
            unsupported extension, or cannot be parsed by pandas.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing file name")

    suffix = Path(file.filename).suffix.lower()
    if suffix not in ALLOWED_SUFFIXES:
        raise HTTPException(status_code=400, detail="Only CSV and Excel files are supported")

    dataset_id = make_dataset_id()
    # The stored name is derived from the generated id, not from the
    # client-supplied file name; only the validated suffix is reused.
    path = UPLOAD_DIR / f"{dataset_id}{suffix}"

    # NOTE(review): the write and the pandas parse below are blocking calls
    # inside an ``async def`` endpoint — consider offloading for large files.
    content = await file.read()
    path.write_bytes(content)

    try:
        df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
    except ImportError:
        # A missing optional Excel engine is a server-side problem, not a bad
        # upload: clean up the stored file and let the error propagate.
        path.unlink(missing_ok=True)
        raise
    except Exception as exc:
        # The pandas readers raise many unrelated exception types for bad
        # input (parser errors, empty data, decode errors, corrupt archives),
        # so the catch is deliberately broad. Remove the stored file so a
        # rejected upload does not linger on disk, and report a client error.
        path.unlink(missing_ok=True)
        raise HTTPException(status_code=400, detail="Could not parse uploaded file") from exc

    profile = profile_dataframe(df)
    row_count = len(df)
    column_count = len(df.columns)
    # Column labels are not guaranteed to be JSON-serialisable (Excel headers
    # may be dates, for example), so they are stored as strings.
    column_names = [str(col) for col in df.columns]

    create_dataset(
        db=db,
        id=dataset_id,
        name=file.filename,
        file_path=str(path),
        row_count=row_count,
        column_count=column_count,
        schema_json=json.dumps({"columns": column_names}),
        profile_json=json.dumps(profile),
    )

    return {
        "dataset_id": dataset_id,
        "name": file.filename,
        "row_count": row_count,
        "column_count": column_count,
    }


@router.get("/datasets/{dataset_id}/profile")
def dataset_profile(dataset_id: str, db: Session = Depends(get_db)):
    """Return the stored schema and profile of one dataset.

    Raises:
        HTTPException: 404 when ``get_dataset`` finds no dataset for
            ``dataset_id``.
    """
    dataset = get_dataset(db, dataset_id)
    if not dataset:
        raise HTTPException(status_code=404, detail="Dataset not found")

    return {
        "dataset_id": dataset.id,
        "name": dataset.name,
        "row_count": dataset.row_count,
        "column_count": dataset.column_count,
        # Both fields are persisted as JSON strings and decoded on the way out.
        "schema": json.loads(dataset.schema_json),
        "profile": json.loads(dataset.profile_json),
    }