Spaces:

kashh65
/

autom_cerebras

Sleeping

autom_cerebras / src /preprocessing /clean_df_fallback.py

akash

first_commit

8c22e77 9 months ago

6.29 kB

	import re
	import pandas as pd
	import numpy as np
	import streamlit as st

	# Define fallback cleaning function

	@st.cache_data
	def clean_dataframe_fallback(df):
	"""Hardcoded data cleaning pipeline"""

	"""Generic data cleaning pipeline with categorical preservation"""
	df_cleaned = df.copy()


	df_cleaned = df_cleaned.applymap(
	lambda x: re.sub(r"$.*?$", "", str(x)) if isinstance(x, str) else x)

	# Remove 'ref.' references
	df_cleaned = df_cleaned.applymap(
	lambda x: re.sub(r"ref\.", "", str(x), flags=re.IGNORECASE) if isinstance(x, str) else x)

	# Remove any other special characters except letters, digits, spaces, and dots
	df_cleaned = df_cleaned.applymap(
	lambda x: re.sub(r"[^\w\s\d\.]", "", str(x)).strip() if isinstance(x, str) else x
	)


	# Step 0 - Clean column names first
	df_cleaned.columns = [col.strip().lower().replace(' ', '_') for col in df_cleaned.columns]

	# Define measurement units to remove
	measurement_units = {
	'weight': r'\s*(kg\|kilograms\|lbs\|pounds)$',
	'height': r'\s*(cm\|centimeters\|inches\|feet\|ft)$'
	}


	# Step 1 - Remove redundant columns
	# Preservation patterns for categorical columns
	preserve_pattern = re.compile(r'(name\|brand\|model\|type\|category\|region\|text\|desc\|color\|size)', re.IGNORECASE)
	preserved_cols = [col for col in df_cleaned.columns if preserve_pattern.search(col)]

	# ID pattern detection
	id_pattern = re.compile(r'(_id\|id_\|num\|no\|number\|identifier\|code\|idx\|row)', re.IGNORECASE)
	id_cols = [col for col in df_cleaned.columns if id_pattern.search(col) and col not in preserved_cols]

	# Unique value columns
	unique_cols = [col for col in df_cleaned.columns
	if df_cleaned[col].nunique() == len(df_cleaned)
	and col not in preserved_cols]

	redundant_cols = list(set(id_cols + unique_cols))
	df_cleaned = df_cleaned.drop(columns=redundant_cols)
	print(f"Removed {len(redundant_cols)} redundant columns: {redundant_cols}")

	# Step 2 - Enhanced numeric detection with categorical protection
	for col in df_cleaned.columns:
	if col in preserved_cols:
	print(f"Preserving categorical column: {col}")
	continue # Skip preserved columns

	if any(unit in col for unit in measurement_units.keys()):
	pattern = measurement_units.get(col.split('_')[0], r'')
	df_cleaned[col] = df_cleaned[col].astype(str).str.replace(pattern, '', regex=True).str.strip()



	if pd.api.types.is_numeric_dtype(df_cleaned[col]):
	continue

	# Strict numeric pattern detection
	non_null_count = df_cleaned[col].dropna().shape[0]
	sample_size = min(100, non_null_count)
	sample = df_cleaned[col].dropna().sample(sample_size, random_state=42)
	numeric_pattern = r'^[-+]?\d*\.?\d+$' # Full string match
	num_matches = sample.astype(str).str.fullmatch(numeric_pattern).mean()

	if num_matches > 0.8: # High threshold
	# Conservative cleaning
	cleaned = df_cleaned[col].replace(r'[^\d\.\-]', '', regex=True)
	converted = pd.to_numeric(cleaned, errors='coerce')
	success_rate = converted.notna().mean()

	if success_rate > 0.9: # Strict success requirement
	df_cleaned[col] = converted
	print(f"Converted {col} to numeric (success: {success_rate:.1%})")

	# Step 3 - Date detection
	date_cols = []
	for col in df_cleaned.select_dtypes(exclude=np.number).columns:
	if col in preserved_cols:
	continue
	try:
	df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='raise')
	date_cols.append(col)
	print(f"Detected datetime: {col}")
	except:
	pass

	# Example manual approach:
	currency_cols = [col for col in df_cleaned.columns if any(keyword in col.lower() for keyword in ["price", "gross", "budget"])]
	for col in currency_cols:
	df_cleaned[col] = df_cleaned[col].astype(str).str.replace(r'[^\d\.]', '', regex=True) # remove everything except digits & dots
	df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')



	# Step 4 - Missing value handling
	numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
	categorical_cols = df_cleaned.select_dtypes(exclude=np.number).columns

	# Numeric imputation
	for col in numeric_cols:
	if df_cleaned[col].isna().any():
	df_cleaned[f'{col}_missing'] = df_cleaned[col].isna().astype(int)
	df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)

	# Categorical imputation
	for col in categorical_cols:
	if df_cleaned[col].isna().any():
	mode_val = df_cleaned[col].mode()[0] if not df_cleaned[col].mode().empty else 'Unknown'
	df_cleaned[col] = df_cleaned[col].fillna(mode_val)

	# Step 5 - Text normalization for non-preserved columns
	text_cols = [col for col in categorical_cols if col not in preserved_cols]
	for col in text_cols:
	df_cleaned[col] = df_cleaned[col].astype(str).apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', x)).strip().lower())


	# Step 6 - Outlier handling (preserve categoricals)
	numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
	for col in numeric_cols:
	if df_cleaned[col].nunique() > 10:
	q1 = df_cleaned[col].quantile(0.05)
	q3 = df_cleaned[col].quantile(0.95)
	df_cleaned[col] = np.clip(df_cleaned[col], q1, q3)

	# Step 7 - Final validation
	df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)

	return df_cleaned