#!/usr/bin/env python3
"""Data Cleaning Plugin"""

import logging
from typing import Dict, Any

import pandas as pd

logger = logging.getLogger(__name__)


class DataCleaner:
    """Clean and standardize messy data for analytics."""

    # Placeholder strings that are treated as missing values once the
    # surrounding whitespace has been stripped.
    _NULL_TOKENS = (
        '', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A', '-', '?', 'unknown',
    )

    # Substrings of a column name that select datetime / numeric coercion.
    _DATETIME_KEYWORDS = ('date', 'time')
    _NUMERIC_KEYWORDS = ('amount', 'price', 'cost', 'value', 'count')

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of *df*; the input is not modified.

        Steps, in order:

        1. Column names are converted to ``str``, stripped, lower-cased,
           spaces become underscores, and every character outside
           ``[a-z0-9_]`` is removed.
        2. In object/string columns each value is converted to text and
           stripped; values that were already missing, or that equal one of
           the placeholder tokens, become ``pd.NA``.
        3. Rows and columns that are entirely missing are dropped.
        4. Duplicate rows are dropped.

        Args:
            df: The frame to clean.

        Returns:
            A new, cleaned ``DataFrame``.
        """
        df = df.copy()
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )

        for col in df.select_dtypes(include=['object', 'string']).columns:
            series = df[col]
            # Record real missing values *before* astype(str); otherwise NaN
            # is stringified to 'nan' and survives as fake data.
            was_missing = series.isna()
            text = series.astype(str).str.strip()
            is_placeholder = text.isin(self._NULL_TOKENS)
            df[col] = text.mask(was_missing | is_placeholder, pd.NA)

        # Drop empties only after placeholders are normalized, so a row made
        # up solely of tokens such as 'N/A' or '-' counts as empty.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
        return df.drop_duplicates()

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with columns coerced based on their names.

        A column whose name contains ``'date'`` or ``'time'`` is passed
        through ``pd.to_datetime``; otherwise, a column whose name contains
        ``'amount'``, ``'price'``, ``'cost'``, ``'value'`` or ``'count'`` is
        passed through ``pd.to_numeric``. Both use ``errors='coerce'``, so
        unparseable values become missing. Columns with non-string labels
        are left untouched. The conversion is best-effort: if it raises,
        the column is left as-is and a warning is logged.

        NOTE(review): matching is by substring, so names such as
        ``country`` or ``update_notes`` also match and get coerced; kept
        for backward compatibility -- confirm whether stricter matching is
        wanted.

        Args:
            df: The frame whose columns should be coerced.

        Returns:
            A new ``DataFrame``; the input is not modified.
        """
        df = df.copy()
        for col in df.columns:
            # Substring tests only make sense for string labels.
            if not isinstance(col, str):
                continue

            if any(kw in col for kw in self._DATETIME_KEYWORDS):
                convert = pd.to_datetime
            elif any(kw in col for kw in self._NUMERIC_KEYWORDS):
                convert = pd.to_numeric
            else:
                continue

            try:
                df[col] = convert(df[col], errors='coerce')
            except (ValueError, TypeError, OverflowError) as exc:
                # Best-effort: keep the original column rather than fail.
                logger.warning(
                    "Could not coerce column %r with %s: %s",
                    col, convert.__name__, exc,
                )
        return df