# uchihamadara1816's picture
# Upload 172 files
# d02bacd verified
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict
import numpy as np
import pandas as pd
@dataclass
class CleaningSummary:
    """Bundle holding a cleaned order table and counters about the cleaning."""

    # The cleaned DataFrame.
    df: pd.DataFrame
    # Number of rows dropped as exact duplicates.
    duplicates_removed: int
    # Number of price values that were missing/non-numeric and got filled in.
    imputed_prices: int
def _rounded_mean(series: pd.Series) -> float:
value = pd.to_numeric(series, errors="coerce").dropna().mean()
if pd.isna(value):
return 0.0
return round(float(value), 2)
def clean_orders(df: pd.DataFrame) -> CleaningSummary:
    """Deduplicate an order table and normalise its ``Price`` column.

    Works on a copy of *df*. Exact duplicate rows are dropped and the index
    is renumbered. ``Price`` is coerced to numeric; values that fail to parse
    (or are missing) are replaced by the rounded mean of the remaining prices,
    and every price is rounded to 2 decimals.

    Returns:
        A ``CleaningSummary`` with the cleaned frame, the number of duplicate
        rows removed and the number of prices that were imputed.
    """
    row_total = len(df)
    deduped = df.copy().drop_duplicates().reset_index(drop=True)
    removed = row_total - len(deduped)

    numeric_prices = pd.to_numeric(deduped["Price"], errors="coerce")
    missing = int(numeric_prices.isna().sum())
    if missing:
        # Only compute the fill value when there is actually a gap to fill.
        numeric_prices = numeric_prices.fillna(_rounded_mean(numeric_prices))
    deduped["Price"] = numeric_prices.round(2)

    return CleaningSummary(
        df=deduped,
        duplicates_removed=removed,
        imputed_prices=missing,
    )
def derive_revenue(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with numeric ``Price``/``Quantity`` and ``Revenue``.

    ``Price`` is coerced to numeric and rounded to 2 decimals, ``Quantity`` is
    coerced to numeric, and ``Revenue`` is their product rounded to 2
    decimals. Unparseable values become NaN. The input frame is not modified.
    """
    price = pd.to_numeric(df["Price"], errors="coerce").round(2)
    quantity = pd.to_numeric(df["Quantity"], errors="coerce")
    revenue = (price * quantity).round(2)
    return df.assign(Price=price, Quantity=quantity, Revenue=revenue)
def compute_kpis(df: pd.DataFrame) -> Dict[str, float]:
    """Compute headline revenue KPIs for an order table.

    Returns a dict with:
        ``total_revenue``   - sum of per-row revenue, rounded to 2 decimals.
        ``avg_order_value`` - total revenue divided by the order count
                              (``0.0`` when there are no orders).
        ``order_count``     - distinct ``OrderID`` values when that column
                              exists, otherwise the row count; as a float.
    """
    enriched = derive_revenue(df)
    revenue_total = round(float(enriched["Revenue"].sum()), 2)

    if "OrderID" in enriched.columns:
        orders = int(enriched["OrderID"].nunique())
    else:
        orders = len(enriched)

    if orders:
        average = round(revenue_total / orders, 2)
    else:
        # Avoid dividing by zero on an empty table.
        average = 0.0

    return {
        "total_revenue": revenue_total,
        "avg_order_value": average,
        "order_count": float(orders),
    }
def compute_revenue_share(df: pd.DataFrame) -> pd.DataFrame:
    """Return revenue per ``Category`` with each category's percentage share.

    The result has one row per category, ordered by ``Revenue`` descending,
    with a ``RevenueShare`` column holding the percentage of total revenue
    rounded to 2 decimals (``0.0`` for every row when the total is zero).
    """
    enriched = derive_revenue(df)
    per_category = enriched.groupby("Category", as_index=False)["Revenue"].sum()
    per_category = per_category.sort_values("Revenue", ascending=False)
    per_category = per_category.reset_index(drop=True)

    grand_total = float(per_category["Revenue"].sum())

    def _share(amount: float) -> float:
        # A zero total would divide by zero; report a 0% share instead.
        if not grand_total:
            return 0.0
        return round((float(amount) / grand_total) * 100, 2)

    per_category["RevenueShare"] = per_category["Revenue"].map(_share)
    return per_category
def monthly_revenue(df: pd.DataFrame) -> pd.DataFrame:
    """Return total revenue per calendar month.

    Rows whose ``OrderDate`` cannot be parsed are discarded. The result has a
    ``Month`` column (period rendered as a string) and a ``Revenue`` column,
    sorted by ``Month``.
    """
    enriched = derive_revenue(df)
    parsed = pd.to_datetime(enriched["OrderDate"], errors="coerce")
    has_date = parsed.notna()

    # Keep only rows with a usable date, then store the parsed timestamps.
    dated = enriched.loc[has_date].copy()
    dated["OrderDate"] = parsed.loc[has_date]
    dated["Month"] = dated["OrderDate"].dt.to_period("M").astype(str)

    totals = dated.groupby("Month", as_index=False)["Revenue"].sum()
    return totals.sort_values("Month").reset_index(drop=True)
def validate_schema(df: pd.DataFrame) -> Dict[str, Any]:
    """Count rows with invalid ``Price``, ``Quantity`` or ``OrderDate`` values.

    A row is counted as invalid when:
        * ``Price`` is missing, non-numeric, or ``<= 0``;
        * ``Quantity`` is missing, non-numeric, or ``< 1``;
        * ``OrderDate`` is missing or cannot be parsed as a date.

    A column that is absent from *df* is treated as invalid on every row
    rather than raising.

    Args:
        df: Order table to inspect. It is only read, never modified.

    Returns:
        A dict with ``invalid_price_rows``, ``invalid_quantity_rows`` and
        ``invalid_date_rows`` (ints) plus ``risk_flags``, a list containing
        ``"price"``, ``"quantity"`` and/or ``"order_date"`` for each check
        that found at least one invalid row.
    """

    def _column(name: str) -> pd.Series:
        # ``df.get(name)`` yields None for a missing column, which breaks the
        # Series operations below; substitute an all-null column instead so
        # every row is reported as invalid.
        if name in df.columns:
            return df[name]
        return pd.Series([None] * len(df), index=df.index, dtype="object")

    prices = pd.to_numeric(_column("Price"), errors="coerce")
    qty = pd.to_numeric(_column("Quantity"), errors="coerce")
    dates = pd.to_datetime(_column("OrderDate"), errors="coerce")

    invalid_price_rows = int((prices.isna() | (prices <= 0)).sum())
    invalid_quantity_rows = int((qty.isna() | (qty < 1)).sum())
    invalid_date_rows = int(dates.isna().sum())

    # Flags are emitted in a fixed order: price, quantity, order_date.
    flag_counts = (
        ("price", invalid_price_rows),
        ("quantity", invalid_quantity_rows),
        ("order_date", invalid_date_rows),
    )
    risk_flags = [flag for flag, count in flag_counts if count]

    return {
        "invalid_price_rows": invalid_price_rows,
        "invalid_quantity_rows": invalid_quantity_rows,
        "invalid_date_rows": invalid_date_rows,
        "risk_flags": risk_flags,
    }
def data_quality_score(df: pd.DataFrame) -> float:
    """Return the fraction of rows that pass every data-quality check.

    A row passes when its ``Price`` is numeric and ``> 0``, its ``Quantity``
    is numeric and ``>= 1``, and its ``OrderDate`` parses as a date. A column
    that is absent from *df* fails the corresponding check on every row.

    Args:
        df: Order table to score. It is only read, never modified.

    Returns:
        A value between ``0.0`` and ``1.0`` rounded to 4 decimals; ``0.0``
        when *df* is empty.
    """
    if df.empty:
        return 0.0

    def _column(name: str) -> pd.Series:
        # ``df.get(name)`` yields None for a missing column, which breaks the
        # Series operations below; substitute an all-null column instead so
        # the check fails for every row.
        if name in df.columns:
            return df[name]
        return pd.Series([None] * len(df), index=df.index, dtype="object")

    prices = pd.to_numeric(_column("Price"), errors="coerce")
    qty = pd.to_numeric(_column("Quantity"), errors="coerce")
    dates = pd.to_datetime(_column("OrderDate"), errors="coerce")

    price_ok = prices.notna() & (prices > 0)
    quantity_ok = qty.notna() & (qty >= 1)
    date_ok = dates.notna()

    passed = price_ok & quantity_ok & date_ok
    return round(float(passed.mean()), 4)