Spaces:
Sleeping
Sleeping
File size: 4,157 Bytes
d02bacd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict
import numpy as np
import pandas as pd
@dataclass
class CleaningSummary:
df: pd.DataFrame
duplicates_removed: int
imputed_prices: int
def _rounded_mean(series: pd.Series) -> float:
value = pd.to_numeric(series, errors="coerce").dropna().mean()
if pd.isna(value):
return 0.0
return round(float(value), 2)
def clean_orders(df: pd.DataFrame) -> CleaningSummary:
working = df.copy()
before = len(working)
working = working.drop_duplicates().reset_index(drop=True)
duplicates_removed = before - len(working)
prices = pd.to_numeric(working["Price"], errors="coerce")
imputed_prices = int(prices.isna().sum())
if imputed_prices:
fill = _rounded_mean(prices)
working["Price"] = prices.fillna(fill).round(2)
else:
working["Price"] = prices.round(2)
return CleaningSummary(df=working, duplicates_removed=duplicates_removed, imputed_prices=imputed_prices)
def derive_revenue(df: pd.DataFrame) -> pd.DataFrame:
working = df.copy()
working["Price"] = pd.to_numeric(working["Price"], errors="coerce").round(2)
working["Quantity"] = pd.to_numeric(working["Quantity"], errors="coerce")
working["Revenue"] = (working["Price"] * working["Quantity"]).round(2)
return working
def compute_kpis(df: pd.DataFrame) -> Dict[str, float]:
working = derive_revenue(df)
total_revenue = round(float(working["Revenue"].sum()), 2)
order_count = int(working["OrderID"].nunique()) if "OrderID" in working.columns else len(working)
avg_order_value = round(total_revenue / order_count, 2) if order_count else 0.0
return {
"total_revenue": total_revenue,
"avg_order_value": avg_order_value,
"order_count": float(order_count),
}
def compute_revenue_share(df: pd.DataFrame) -> pd.DataFrame:
working = derive_revenue(df)
grouped = (
working.groupby("Category", as_index=False)["Revenue"]
.sum()
.sort_values("Revenue", ascending=False)
.reset_index(drop=True)
)
total = float(grouped["Revenue"].sum())
grouped["RevenueShare"] = grouped["Revenue"].map(lambda v: round((float(v) / total) * 100, 2) if total else 0.0)
return grouped
def monthly_revenue(df: pd.DataFrame) -> pd.DataFrame:
working = derive_revenue(df)
dates = pd.to_datetime(working["OrderDate"], errors="coerce")
working = working.loc[dates.notna()].copy()
working["OrderDate"] = dates.loc[dates.notna()]
working["Month"] = working["OrderDate"].dt.to_period("M").astype(str)
monthly = working.groupby("Month", as_index=False)["Revenue"].sum()
return monthly.sort_values("Month").reset_index(drop=True)
def validate_schema(df: pd.DataFrame) -> Dict[str, Any]:
working = df.copy()
prices = pd.to_numeric(working.get("Price"), errors="coerce")
qty = pd.to_numeric(working.get("Quantity"), errors="coerce")
dates = pd.to_datetime(working.get("OrderDate"), errors="coerce")
invalid_price_rows = int((prices.isna() | (prices <= 0)).sum())
invalid_quantity_rows = int((qty.isna() | (qty < 1)).sum())
invalid_date_rows = int(dates.isna().sum())
risk_flags = []
if invalid_price_rows:
risk_flags.append("price")
if invalid_quantity_rows:
risk_flags.append("quantity")
if invalid_date_rows:
risk_flags.append("order_date")
return {
"invalid_price_rows": invalid_price_rows,
"invalid_quantity_rows": invalid_quantity_rows,
"invalid_date_rows": invalid_date_rows,
"risk_flags": risk_flags,
}
def data_quality_score(df: pd.DataFrame) -> float:
if df.empty:
return 0.0
checks = pd.Series([True] * len(df), index=df.index)
prices = pd.to_numeric(df.get("Price"), errors="coerce")
qty = pd.to_numeric(df.get("Quantity"), errors="coerce")
dates = pd.to_datetime(df.get("OrderDate"), errors="coerce")
checks &= prices.notna() & (prices > 0)
checks &= qty.notna() & (qty >= 1)
checks &= dates.notna()
return round(float(checks.mean()), 4)
|