JatinAutonomousLabs's picture
Upload 4 files
9d08bab verified
#!/usr/bin/env python3
"""Data Cleaning Plugin"""
import pandas as pd
from typing import Dict, Any
class DataCleaner:
    """Clean and standardize messy data for analytics."""

    # Strings that, after whitespace stripping, are treated as missing values.
    _NULL_TOKENS = ('', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A', '-', '?', 'unknown')
    # Column-name substrings that mark a column for numeric coercion.
    _NUMERIC_KEYWORDS = ('amount', 'price', 'cost', 'value', 'count')

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Steps, in order:
          1. Normalize column names: cast to str, strip, lowercase, replace
             spaces with underscores, then drop every character outside
             ``[a-z0-9_]``.
          2. Drop rows, then columns, that are entirely missing.
          3. For object/string columns: cast values to str, strip whitespace
             and replace the placeholder strings in ``_NULL_TOKENS`` with
             ``pd.NA``.
          4. Drop duplicate rows.

        Args:
            df: The frame to clean.

        Returns:
            A new, cleaned DataFrame.
        """
        df = df.copy()
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

        null_values = list(self._NULL_TOKENS)
        for col in df.select_dtypes(include=['object', 'string']).columns:
            # Remember which cells are genuinely missing *before* the str cast:
            # astype(str) would otherwise turn NaN into the literal text 'nan',
            # which is not a null token and would survive as fake data.
            was_missing = df[col].isna()
            cleaned = df[col].astype(str).str.strip().replace(null_values, pd.NA)
            cleaned[was_missing] = pd.NA
            df[col] = cleaned

        df = df.drop_duplicates()
        return df

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with columns coerced based on their names.

        Columns whose name contains ``'date'`` or ``'time'`` are parsed with
        ``pd.to_datetime``; otherwise, columns whose name contains one of
        ``_NUMERIC_KEYWORDS`` are parsed with ``pd.to_numeric``. Both use
        ``errors='coerce'``, so unparseable values become missing. The name
        match is case-sensitive. All other columns are left as they are.

        Args:
            df: The frame to convert; it is not modified.

        Returns:
            A new DataFrame with the conversions applied.
        """
        df = df.copy()
        for col in df.columns:
            try:
                if 'date' in col or 'time' in col:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                elif any(kw in col for kw in self._NUMERIC_KEYWORDS):
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            except Exception:
                # Best effort: a column that cannot be converted (e.g. a
                # non-string label that does not support ``in``, or an
                # unsupported dtype) is left unchanged. Catching Exception
                # rather than using a bare ``except`` lets KeyboardInterrupt
                # and SystemExit propagate.
                continue
        return df