Spaces:

uchihamadara1816
/

AutoDataLab2.0

Sleeping

App Files Files Community

AutoDataLab2.0 / subenvs /autodatalab /analytics.py

uchihamadara1816

Upload 172 files

d02bacd verified 25 days ago

raw

history blame contribute delete

4.16 kB

	from __future__ import annotations

	from dataclasses import dataclass
	from typing import Any, Dict

	import numpy as np
	import pandas as pd


	@dataclass
	class CleaningSummary:
	    """Outcome of an order-cleaning pass: the cleaned frame plus change counts."""

	    # Cleaned copy of the input orders.
	    df: pd.DataFrame
	    # Number of exact duplicate rows that were dropped.
	    duplicates_removed: int
	    # Number of price cells that were missing/unparseable and got filled.
	    imputed_prices: int


	def _rounded_mean(series: pd.Series) -> float:
	value = pd.to_numeric(series, errors="coerce").dropna().mean()
	if pd.isna(value):
	return 0.0
	return round(float(value), 2)


	def clean_orders(df: pd.DataFrame) -> CleaningSummary:
	    """Drop exact duplicate rows and fill missing prices with the mean price.

	    The input frame is not modified. ``Price`` is coerced to numeric; cells
	    that are missing or unparseable are replaced by the 2-decimal mean of the
	    remaining prices (``0.0`` when there are none), and all prices are
	    rounded to 2 decimals.

	    Args:
	        df: Orders frame; must contain a ``Price`` column.

	    Returns:
	        A ``CleaningSummary`` with the cleaned frame, the number of duplicate
	        rows removed and the number of price cells that were filled.

	    Raises:
	        KeyError: If ``df`` has no ``Price`` column.
	    """
	    working = df.copy()
	    rows_before = len(working)
	    working = working.drop_duplicates().reset_index(drop=True)
	    duplicates_removed = rows_before - len(working)

	    prices = pd.to_numeric(working["Price"], errors="coerce")
	    # Count after coercion so unparseable strings are imputed as well.
	    imputed_prices = int(prices.isna().sum())
	    if imputed_prices:
	        prices = prices.fillna(_rounded_mean(prices))
	    working["Price"] = prices.round(2)

	    return CleaningSummary(
	        df=working,
	        duplicates_removed=duplicates_removed,
	        imputed_prices=imputed_prices,
	    )


	def derive_revenue(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of *df* with numeric ``Price``/``Quantity`` and a ``Revenue`` column.

	    ``Price`` is coerced to numeric and rounded to 2 decimals, ``Quantity`` is
	    coerced to numeric, and ``Revenue`` is their product rounded to 2
	    decimals. Unparseable cells become NaN, which makes that row's revenue NaN.
	    The input frame is left untouched.

	    Raises:
	        KeyError: If ``Price`` or ``Quantity`` is missing from ``df``.
	    """
	    working = df.copy()
	    working["Price"] = pd.to_numeric(working["Price"], errors="coerce").round(2)
	    working["Quantity"] = pd.to_numeric(working["Quantity"], errors="coerce")
	    working["Revenue"] = (working["Price"] * working["Quantity"]).round(2)
	    return working


	def compute_kpis(df: pd.DataFrame) -> Dict[str, float]:
	    """Compute headline order KPIs.

	    Returns:
	        A dict with:
	        ``total_revenue`` - sum of per-row revenue, rounded to 2 decimals;
	        ``avg_order_value`` - total revenue divided by the order count
	        (``0.0`` when there are no orders);
	        ``order_count`` - distinct ``OrderID`` values when that column exists,
	        otherwise the number of rows, as a float.
	    """
	    working = derive_revenue(df)
	    total_revenue = round(float(working["Revenue"].sum()), 2)

	    if "OrderID" in working.columns:
	        order_count = int(working["OrderID"].nunique())
	    else:
	        # Without an order identifier every row counts as one order.
	        order_count = len(working)

	    avg_order_value = round(total_revenue / order_count, 2) if order_count else 0.0
	    return {
	        "total_revenue": total_revenue,
	        "avg_order_value": avg_order_value,
	        "order_count": float(order_count),
	    }


	def compute_revenue_share(df: pd.DataFrame) -> pd.DataFrame:
	    """Return revenue per ``Category`` with each category's percentage share.

	    The result has columns ``Category``, ``Revenue`` and ``RevenueShare``
	    (percent of total revenue, rounded to 2 decimals), sorted by revenue in
	    descending order. When total revenue is zero every share is ``0.0``.
	    """
	    working = derive_revenue(df)
	    grouped = (
	        working.groupby("Category", as_index=False)["Revenue"]
	        .sum()
	        .sort_values("Revenue", ascending=False)
	        .reset_index(drop=True)
	    )
	    total = float(grouped["Revenue"].sum())

	    def share(revenue: float) -> float:
	        # Guard against division by zero when nothing was sold.
	        if not total:
	            return 0.0
	        return round((float(revenue) / total) * 100, 2)

	    grouped["RevenueShare"] = grouped["Revenue"].map(share)
	    return grouped


	def monthly_revenue(df: pd.DataFrame) -> pd.DataFrame:
	    """Return total revenue per calendar month.

	    Rows whose ``OrderDate`` cannot be parsed are dropped. The result has a
	    ``Month`` column (period string such as ``"2024-01"``) and a ``Revenue``
	    column, sorted by month.
	    """
	    working = derive_revenue(df)
	    dates = pd.to_datetime(working["OrderDate"], errors="coerce")
	    has_date = dates.notna()

	    # Keep only rows with a usable date, then store the parsed timestamps.
	    working = working.loc[has_date].copy()
	    working["OrderDate"] = dates.loc[has_date]
	    working["Month"] = working["OrderDate"].dt.to_period("M").astype(str)

	    monthly = working.groupby("Month", as_index=False)["Revenue"].sum()
	    return monthly.sort_values("Month").reset_index(drop=True)


	def validate_schema(df: pd.DataFrame) -> Dict[str, Any]:
	    """Count rows with invalid ``Price``, ``Quantity`` or ``OrderDate`` values.

	    A price is invalid when it is missing, unparseable or ``<= 0``; a quantity
	    when it is missing, unparseable or ``< 1``; a date when it is missing or
	    unparseable. A column that is absent from ``df`` counts as invalid for
	    every row.

	    Returns:
	        A dict with ``invalid_price_rows``, ``invalid_quantity_rows`` and
	        ``invalid_date_rows`` (ints), plus ``risk_flags``: the subset of
	        ``["price", "quantity", "order_date"]`` whose count is non-zero.
	    """

	    def column(name: str) -> pd.Series:
	        # DataFrame.get returns None for an absent column; substitute an
	        # all-missing column so the checks below flag every row instead of
	        # failing on a non-Series value.
	        found = df.get(name)
	        if found is None:
	            return pd.Series([None] * len(df), index=df.index, dtype=object)
	        return found

	    prices = pd.to_numeric(column("Price"), errors="coerce")
	    qty = pd.to_numeric(column("Quantity"), errors="coerce")
	    dates = pd.to_datetime(column("OrderDate"), errors="coerce")

	    invalid_price_rows = int((prices.isna() | (prices <= 0)).sum())
	    invalid_quantity_rows = int((qty.isna() | (qty < 1)).sum())
	    invalid_date_rows = int(dates.isna().sum())

	    flagged = (
	        ("price", invalid_price_rows),
	        ("quantity", invalid_quantity_rows),
	        ("order_date", invalid_date_rows),
	    )
	    risk_flags = [label for label, count in flagged if count]

	    return {
	        "invalid_price_rows": invalid_price_rows,
	        "invalid_quantity_rows": invalid_quantity_rows,
	        "invalid_date_rows": invalid_date_rows,
	        "risk_flags": risk_flags,
	    }


	def data_quality_score(df: pd.DataFrame) -> float:
	    """Return the fraction of rows that pass all quality checks, rounded to 4 places.

	    A row passes when its ``Price`` is numeric and ``> 0``, its ``Quantity``
	    is numeric and ``>= 1``, and its ``OrderDate`` parses as a date. An empty
	    frame scores ``0.0``. A column that is absent from ``df`` fails every row.
	    """
	    if df.empty:
	        return 0.0

	    def column(name: str) -> pd.Series:
	        # DataFrame.get returns None for an absent column; use an all-missing
	        # stand-in so such rows fail the check instead of raising.
	        found = df.get(name)
	        if found is None:
	            return pd.Series([None] * len(df), index=df.index, dtype=object)
	        return found

	    prices = pd.to_numeric(column("Price"), errors="coerce")
	    qty = pd.to_numeric(column("Quantity"), errors="coerce")
	    dates = pd.to_datetime(column("OrderDate"), errors="coerce")

	    checks = pd.Series([True] * len(df), index=df.index)
	    checks &= prices.notna() & (prices > 0)
	    checks &= qty.notna() & (qty >= 1)
	    checks &= dates.notna()
	    return round(float(checks.mean()), 4)