Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Any, Dict | |
| import numpy as np | |
| import pandas as pd | |
| class CleaningSummary: | |
| df: pd.DataFrame | |
| duplicates_removed: int | |
| imputed_prices: int | |
| def _rounded_mean(series: pd.Series) -> float: | |
| value = pd.to_numeric(series, errors="coerce").dropna().mean() | |
| if pd.isna(value): | |
| return 0.0 | |
| return round(float(value), 2) | |
| def clean_orders(df: pd.DataFrame) -> CleaningSummary: | |
| working = df.copy() | |
| before = len(working) | |
| working = working.drop_duplicates().reset_index(drop=True) | |
| duplicates_removed = before - len(working) | |
| prices = pd.to_numeric(working["Price"], errors="coerce") | |
| imputed_prices = int(prices.isna().sum()) | |
| if imputed_prices: | |
| fill = _rounded_mean(prices) | |
| working["Price"] = prices.fillna(fill).round(2) | |
| else: | |
| working["Price"] = prices.round(2) | |
| return CleaningSummary(df=working, duplicates_removed=duplicates_removed, imputed_prices=imputed_prices) | |
| def derive_revenue(df: pd.DataFrame) -> pd.DataFrame: | |
| working = df.copy() | |
| working["Price"] = pd.to_numeric(working["Price"], errors="coerce").round(2) | |
| working["Quantity"] = pd.to_numeric(working["Quantity"], errors="coerce") | |
| working["Revenue"] = (working["Price"] * working["Quantity"]).round(2) | |
| return working | |
| def compute_kpis(df: pd.DataFrame) -> Dict[str, float]: | |
| working = derive_revenue(df) | |
| total_revenue = round(float(working["Revenue"].sum()), 2) | |
| order_count = int(working["OrderID"].nunique()) if "OrderID" in working.columns else len(working) | |
| avg_order_value = round(total_revenue / order_count, 2) if order_count else 0.0 | |
| return { | |
| "total_revenue": total_revenue, | |
| "avg_order_value": avg_order_value, | |
| "order_count": float(order_count), | |
| } | |
| def compute_revenue_share(df: pd.DataFrame) -> pd.DataFrame: | |
| working = derive_revenue(df) | |
| grouped = ( | |
| working.groupby("Category", as_index=False)["Revenue"] | |
| .sum() | |
| .sort_values("Revenue", ascending=False) | |
| .reset_index(drop=True) | |
| ) | |
| total = float(grouped["Revenue"].sum()) | |
| grouped["RevenueShare"] = grouped["Revenue"].map(lambda v: round((float(v) / total) * 100, 2) if total else 0.0) | |
| return grouped | |
| def monthly_revenue(df: pd.DataFrame) -> pd.DataFrame: | |
| working = derive_revenue(df) | |
| dates = pd.to_datetime(working["OrderDate"], errors="coerce") | |
| working = working.loc[dates.notna()].copy() | |
| working["OrderDate"] = dates.loc[dates.notna()] | |
| working["Month"] = working["OrderDate"].dt.to_period("M").astype(str) | |
| monthly = working.groupby("Month", as_index=False)["Revenue"].sum() | |
| return monthly.sort_values("Month").reset_index(drop=True) | |
| def validate_schema(df: pd.DataFrame) -> Dict[str, Any]: | |
| working = df.copy() | |
| prices = pd.to_numeric(working.get("Price"), errors="coerce") | |
| qty = pd.to_numeric(working.get("Quantity"), errors="coerce") | |
| dates = pd.to_datetime(working.get("OrderDate"), errors="coerce") | |
| invalid_price_rows = int((prices.isna() | (prices <= 0)).sum()) | |
| invalid_quantity_rows = int((qty.isna() | (qty < 1)).sum()) | |
| invalid_date_rows = int(dates.isna().sum()) | |
| risk_flags = [] | |
| if invalid_price_rows: | |
| risk_flags.append("price") | |
| if invalid_quantity_rows: | |
| risk_flags.append("quantity") | |
| if invalid_date_rows: | |
| risk_flags.append("order_date") | |
| return { | |
| "invalid_price_rows": invalid_price_rows, | |
| "invalid_quantity_rows": invalid_quantity_rows, | |
| "invalid_date_rows": invalid_date_rows, | |
| "risk_flags": risk_flags, | |
| } | |
| def data_quality_score(df: pd.DataFrame) -> float: | |
| if df.empty: | |
| return 0.0 | |
| checks = pd.Series([True] * len(df), index=df.index) | |
| prices = pd.to_numeric(df.get("Price"), errors="coerce") | |
| qty = pd.to_numeric(df.get("Quantity"), errors="coerce") | |
| dates = pd.to_datetime(df.get("OrderDate"), errors="coerce") | |
| checks &= prices.notna() & (prices > 0) | |
| checks &= qty.notna() & (qty >= 1) | |
| checks &= dates.notna() | |
| return round(float(checks.mean()), 4) | |