Spaces:
Sleeping
Sleeping
File size: 2,592 Bytes
4013eed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | import json
from pathlib import Path
import pandas as pd
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from sqlalchemy.orm import Session
from backend.app.db import get_db
from backend.app.repositories.dataset_repo import create_dataset, get_dataset, list_datasets
from backend.app.services.profiling_service import profile_dataframe
from backend.app.utils.ids import make_dataset_id
# Router collecting the dataset endpoints; routes are tagged "datasets".
router = APIRouter(tags=["datasets"])
# Directory where upload_dataset stores the raw uploaded files.
UPLOAD_DIR = Path("/data/uploads")
# Import-time side effect: create the upload directory (and any missing
# parents) so the upload handler can write to it; no error if it exists.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
@router.get("/datasets")
def datasets_list(db: Session = Depends(get_db)):
    """List every known dataset with its basic size information.

    Args:
        db: Database session injected by FastAPI via ``get_db``.

    Returns:
        A dict with a single ``"datasets"`` key holding one summary dict
        (``id``, ``name``, ``row_count``, ``column_count``) per record
        returned by ``list_datasets``.
    """
    summaries = []
    for record in list_datasets(db):
        summary = {
            "id": record.id,
            "name": record.name,
            "row_count": record.row_count,
            "column_count": record.column_count,
        }
        summaries.append(summary)
    return {"datasets": summaries}
@router.post("/datasets/upload")
async def upload_dataset(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Store an uploaded CSV/Excel file, profile it and register it as a dataset.

    The file is written to ``UPLOAD_DIR`` under a freshly generated dataset
    id (keeping the original extension), parsed with pandas, profiled with
    ``profile_dataframe`` and persisted through ``create_dataset``.

    Args:
        file: The uploaded file; its name must end in ``.csv``, ``.xlsx``
            or ``.xls`` (case-insensitive).
        db: Database session injected by FastAPI via ``get_db``.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count`` and
        ``column_count`` of the newly registered dataset.

    Raises:
        HTTPException: 400 when the file name is missing, the extension is
            unsupported, or the content cannot be parsed by pandas.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing file name")

    suffix = Path(file.filename).suffix.lower()
    if suffix not in {".csv", ".xlsx", ".xls"}:
        raise HTTPException(status_code=400, detail="Only CSV and Excel files are supported")

    dataset_id = make_dataset_id()
    # The stored name is derived from the generated id, never from the
    # client-supplied file name, so the upload cannot choose its own path.
    path = UPLOAD_DIR / f"{dataset_id}{suffix}"
    content = await file.read()
    path.write_bytes(content)

    try:
        df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
    except Exception as exc:
        # Empty, corrupt or mis-labelled uploads are a client problem: remove
        # the stored file and answer 400 instead of leaking a 500.
        path.unlink(missing_ok=True)
        raise HTTPException(
            status_code=400,
            detail="Could not parse the uploaded file",
        ) from exc

    row_count = len(df)
    column_count = len(df.columns)

    try:
        profile = profile_dataframe(df)
        create_dataset(
            db=db,
            id=dataset_id,
            name=file.filename,
            file_path=str(path),
            row_count=row_count,
            column_count=column_count,
            schema_json=json.dumps({"columns": list(df.columns)}),
            profile_json=json.dumps(profile),
        )
    except Exception:
        # Without a database record the stored file would be orphaned;
        # clean it up and let the original error propagate unchanged.
        path.unlink(missing_ok=True)
        raise

    return {
        "dataset_id": dataset_id,
        "name": file.filename,
        "row_count": row_count,
        "column_count": column_count,
    }
@router.get("/datasets/{dataset_id}/profile")
def dataset_profile(dataset_id: str, db: Session = Depends(get_db)):
    """Return the stored schema and profile of a single dataset.

    Args:
        dataset_id: Identifier of the dataset, taken from the URL path.
        db: Database session injected by FastAPI via ``get_db``.

    Returns:
        A dict with the dataset's id, name, row/column counts and the
        decoded ``schema_json`` and ``profile_json`` payloads.

    Raises:
        HTTPException: 404 when ``get_dataset`` yields nothing for the id.
    """
    record = get_dataset(db, dataset_id)
    if not record:
        raise HTTPException(status_code=404, detail="Dataset not found")

    # Both fields are stored as JSON text and decoded for the response.
    schema = json.loads(record.schema_json)
    profile = json.loads(record.profile_json)

    response = {
        "dataset_id": record.id,
        "name": record.name,
        "row_count": record.row_count,
        "column_count": record.column_count,
        "schema": schema,
        "profile": profile,
    }
    return response
|