File size: 4,157 Bytes
d02bacd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict

import numpy as np
import pandas as pd


@dataclass
class CleaningSummary:
    df: pd.DataFrame
    duplicates_removed: int
    imputed_prices: int


def _rounded_mean(series: pd.Series) -> float:
    value = pd.to_numeric(series, errors="coerce").dropna().mean()
    if pd.isna(value):
        return 0.0
    return round(float(value), 2)


def clean_orders(df: pd.DataFrame) -> CleaningSummary:
    working = df.copy()
    before = len(working)
    working = working.drop_duplicates().reset_index(drop=True)
    duplicates_removed = before - len(working)
    prices = pd.to_numeric(working["Price"], errors="coerce")
    imputed_prices = int(prices.isna().sum())
    if imputed_prices:
        fill = _rounded_mean(prices)
        working["Price"] = prices.fillna(fill).round(2)
    else:
        working["Price"] = prices.round(2)
    return CleaningSummary(df=working, duplicates_removed=duplicates_removed, imputed_prices=imputed_prices)


def derive_revenue(df: pd.DataFrame) -> pd.DataFrame:
    working = df.copy()
    working["Price"] = pd.to_numeric(working["Price"], errors="coerce").round(2)
    working["Quantity"] = pd.to_numeric(working["Quantity"], errors="coerce")
    working["Revenue"] = (working["Price"] * working["Quantity"]).round(2)
    return working


def compute_kpis(df: pd.DataFrame) -> Dict[str, float]:
    working = derive_revenue(df)
    total_revenue = round(float(working["Revenue"].sum()), 2)
    order_count = int(working["OrderID"].nunique()) if "OrderID" in working.columns else len(working)
    avg_order_value = round(total_revenue / order_count, 2) if order_count else 0.0
    return {
        "total_revenue": total_revenue,
        "avg_order_value": avg_order_value,
        "order_count": float(order_count),
    }


def compute_revenue_share(df: pd.DataFrame) -> pd.DataFrame:
    working = derive_revenue(df)
    grouped = (
        working.groupby("Category", as_index=False)["Revenue"]
        .sum()
        .sort_values("Revenue", ascending=False)
        .reset_index(drop=True)
    )
    total = float(grouped["Revenue"].sum())
    grouped["RevenueShare"] = grouped["Revenue"].map(lambda v: round((float(v) / total) * 100, 2) if total else 0.0)
    return grouped


def monthly_revenue(df: pd.DataFrame) -> pd.DataFrame:
    working = derive_revenue(df)
    dates = pd.to_datetime(working["OrderDate"], errors="coerce")
    working = working.loc[dates.notna()].copy()
    working["OrderDate"] = dates.loc[dates.notna()]
    working["Month"] = working["OrderDate"].dt.to_period("M").astype(str)
    monthly = working.groupby("Month", as_index=False)["Revenue"].sum()
    return monthly.sort_values("Month").reset_index(drop=True)


def validate_schema(df: pd.DataFrame) -> Dict[str, Any]:
    working = df.copy()
    prices = pd.to_numeric(working.get("Price"), errors="coerce")
    qty = pd.to_numeric(working.get("Quantity"), errors="coerce")
    dates = pd.to_datetime(working.get("OrderDate"), errors="coerce")
    invalid_price_rows = int((prices.isna() | (prices <= 0)).sum())
    invalid_quantity_rows = int((qty.isna() | (qty < 1)).sum())
    invalid_date_rows = int(dates.isna().sum())
    risk_flags = []
    if invalid_price_rows:
        risk_flags.append("price")
    if invalid_quantity_rows:
        risk_flags.append("quantity")
    if invalid_date_rows:
        risk_flags.append("order_date")
    return {
        "invalid_price_rows": invalid_price_rows,
        "invalid_quantity_rows": invalid_quantity_rows,
        "invalid_date_rows": invalid_date_rows,
        "risk_flags": risk_flags,
    }


def data_quality_score(df: pd.DataFrame) -> float:
    if df.empty:
        return 0.0
    checks = pd.Series([True] * len(df), index=df.index)
    prices = pd.to_numeric(df.get("Price"), errors="coerce")
    qty = pd.to_numeric(df.get("Quantity"), errors="coerce")
    dates = pd.to_datetime(df.get("OrderDate"), errors="coerce")
    checks &= prices.notna() & (prices > 0)
    checks &= qty.notna() & (qty >= 1)
    checks &= dates.notna()
    return round(float(checks.mean()), 4)