Spaces:
Sleeping
Sleeping
| import json | |
| from pathlib import Path | |
| import pandas as pd | |
| from fastapi import APIRouter, Depends, File, HTTPException, UploadFile | |
| from sqlalchemy.orm import Session | |
| from backend.app.db import get_db | |
| from backend.app.repositories.dataset_repo import create_dataset, get_dataset, list_datasets | |
| from backend.app.services.profiling_service import profile_dataframe | |
| from backend.app.utils.ids import make_dataset_id | |
# Router object for this module, created with the "datasets" tag.
router = APIRouter(tags=["datasets"])
# Directory where upload_dataset writes the raw uploaded files.
UPLOAD_DIR = Path("/data/uploads")
# Runs at import time: parents=True creates missing ancestor directories and
# exist_ok=True keeps the call from failing when the directory already exists.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
def datasets_list(db: Session = Depends(get_db)):
    """Return a summary of every dataset yielded by ``list_datasets``.

    Args:
        db: Database session, injected through ``Depends(get_db)``.

    Returns:
        A dict with a single ``"datasets"`` key whose value is a list of
        dicts, each carrying the ``id``, ``name``, ``row_count`` and
        ``column_count`` attributes of one dataset record.
    """
    # Attribute names copied into each summary, in output order.
    summary_fields = ("id", "name", "row_count", "column_count")

    summaries = [
        {field: getattr(record, field) for field in summary_fields}
        for record in list_datasets(db)
    ]
    return {"datasets": summaries}
async def upload_dataset(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Store an uploaded CSV/Excel file, profile it, and register it as a dataset.

    The raw bytes are written to ``UPLOAD_DIR`` as ``<dataset_id><suffix>``,
    loaded into a DataFrame with pandas, profiled with ``profile_dataframe``
    and recorded through ``create_dataset``.

    Args:
        file: The uploaded file; its name must end in ``.csv``, ``.xlsx`` or
            ``.xls`` (case-insensitive).
        db: Database session, injected through ``Depends(get_db)``.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count`` and
        ``column_count`` for the newly stored dataset.

    Raises:
        HTTPException: 400 when the file name is missing, the extension is not
            supported, or pandas cannot parse the file contents.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing file name")

    suffix = Path(file.filename).suffix.lower()
    if suffix not in {".csv", ".xlsx", ".xls"}:
        raise HTTPException(status_code=400, detail="Only CSV and Excel files are supported")

    dataset_id = make_dataset_id()
    path = UPLOAD_DIR / f"{dataset_id}{suffix}"
    path.write_bytes(await file.read())

    # NOTE: the pandas calls below are synchronous inside an async handler.
    try:
        try:
            df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
        except ImportError:
            # A missing Excel engine is a server problem, not a bad upload.
            raise
        except Exception as exc:
            # Empty, truncated or otherwise malformed uploads are client errors.
            raise HTTPException(
                status_code=400, detail="Could not parse uploaded file"
            ) from exc

        row_count = int(len(df))
        column_count = int(len(df.columns))
        profile = profile_dataframe(df)

        create_dataset(
            db=db,
            id=dataset_id,
            name=file.filename,
            file_path=str(path),
            row_count=row_count,
            column_count=column_count,
            schema_json=json.dumps({"columns": list(df.columns)}),
            profile_json=json.dumps(profile),
        )
    except Exception:
        # Nothing was registered for this upload, so do not leave the stored
        # file behind as an orphan in UPLOAD_DIR.
        path.unlink(missing_ok=True)
        raise

    return {
        "dataset_id": dataset_id,
        "name": file.filename,
        "row_count": row_count,
        "column_count": column_count,
    }
def dataset_profile(dataset_id: str, db: Session = Depends(get_db)):
    """Return the stored metadata, schema and profile of one dataset.

    Args:
        dataset_id: Identifier passed to ``get_dataset``.
        db: Database session, injected through ``Depends(get_db)``.

    Returns:
        A dict with ``dataset_id``, ``name``, ``row_count``, ``column_count``,
        plus ``schema`` and ``profile`` decoded from the record's
        ``schema_json`` and ``profile_json`` strings.

    Raises:
        HTTPException: 404 when ``get_dataset`` returns a falsy value.
    """
    record = get_dataset(db, dataset_id)
    if not record:
        raise HTTPException(status_code=404, detail="Dataset not found")

    payload = {
        "dataset_id": record.id,
        "name": record.name,
        "row_count": record.row_count,
        "column_count": record.column_count,
    }
    # The JSON columns are stored as text; decode them before returning.
    payload["schema"] = json.loads(record.schema_json)
    payload["profile"] = json.loads(record.profile_json)
    return payload