Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import re | |
| # extract numbers | |
| def extract_number(x): | |
| try: | |
| x = str(x).strip() | |
| # invalid values | |
| if x in ["", "|", "nan", "None"]: | |
| return np.nan | |
| # remove common symbols | |
| x = ( | |
| x.replace(",", "") | |
| .replace("₹", "") | |
| .replace("$", "") | |
| .replace("%", "") | |
| ) | |
| # range values | |
| if "-" in x: | |
| nums = x.split("-") | |
| if len(nums) == 2: | |
| try: | |
| return ( | |
| float(nums[0]) + | |
| float(nums[1]) | |
| ) / 2 | |
| except: | |
| pass | |
| # first numeric pattern | |
| num = re.findall( | |
| r"\d+\.?\d*", | |
| x | |
| ) | |
| if num: | |
| return float(num[0]) | |
| return np.nan | |
| except: | |
| return np.nan | |
| # numeric detection | |
| def is_numeric_like(series, threshold=0.70): | |
| try: | |
| converted = ( | |
| series.dropna() | |
| .astype(str) | |
| .apply(extract_number) | |
| ) | |
| success_rate = ( | |
| converted.notnull().mean() | |
| ) | |
| return success_rate >= threshold | |
| except: | |
| return False | |
| # data detection | |
| def looks_like_date(series): | |
| try: | |
| sample = ( | |
| series.dropna() | |
| .astype(str) | |
| .head(20) | |
| ) | |
| keywords = [ | |
| "-", "/", ":", | |
| "jan", "feb", "mar", | |
| "apr", "may", "jun", | |
| "jul", "aug", "sep", | |
| "oct", "nov", "dec", | |
| "202", "201", "200" | |
| ] | |
| return any( | |
| any( | |
| k in val.lower() | |
| for k in keywords | |
| ) | |
| for val in sample | |
| ) | |
| except: | |
| return False | |
| # date check | |
| def is_datetime_like(series, threshold=0.80): | |
| try: | |
| parsed = pd.to_datetime( | |
| series, | |
| errors="coerce" | |
| ) | |
| success_rate = ( | |
| parsed.notnull().mean() | |
| ) | |
| return success_rate >= threshold | |
| except: | |
| return False | |
| ### main function | |
| def preprocess(df, target): | |
| print("\n--- SMART PREPROCESSING ---") | |
| df = df.copy() | |
| report = { | |
| "converted_numeric": [], | |
| "converted_datetime": [], | |
| "dropped_columns": [], | |
| "warnings": [], | |
| "high_cardinality": [], | |
| "outliers": {}, | |
| "recommendations": [] | |
| } | |
| # remove duplicate | |
| dup_count = df.duplicated().sum() | |
| if dup_count > 0: | |
| print( | |
| f"Removing duplicate rows: " | |
| f"{dup_count}" | |
| ) | |
| df = df.drop_duplicates() | |
| # comvert date time | |
| for col in list(df.columns): | |
| if df[col].dtype == "object": | |
| if ( | |
| df[col].nunique() < 50 | |
| and looks_like_date(df[col]) | |
| and is_datetime_like(df[col]) | |
| ): | |
| print( | |
| f"Converting '{col}' " | |
| f"to datetime" | |
| ) | |
| df[col] = pd.to_datetime( | |
| df[col], | |
| errors="coerce" | |
| ) | |
| report[ | |
| "converted_datetime" | |
| ].append(col) | |
| # numeric conversion | |
| for col in list(df.columns): | |
| if df[col].dtype == "object": | |
| if is_numeric_like(df[col]): | |
| print( | |
| f"Converting '{col}' " | |
| f"to numeric" | |
| ) | |
| df[col] = ( | |
| df[col] | |
| .astype(str) | |
| .apply(extract_number) | |
| ) | |
| report[ | |
| "converted_numeric" | |
| ].append(col) | |
| # remove invalid target rows | |
| if target in df.columns: | |
| before = len(df) | |
| df = df.dropna(subset=[target]) | |
| removed = before - len(df) | |
| if removed > 0: | |
| print( | |
| f"Removed {removed} rows " | |
| f"with invalid target values" | |
| ) | |
| # drop id columns | |
| keywords = [ | |
| "id", | |
| "index", | |
| "code", | |
| "serial", | |
| "number" | |
| ] | |
| id_cols = [ | |
| col for col in df.columns | |
| if ( | |
| df[col].nunique(dropna=False) | |
| == len(df) | |
| ) | |
| and any( | |
| k in col.lower() | |
| for k in keywords | |
| ) | |
| and col != target | |
| ] | |
| if id_cols: | |
| print( | |
| "Dropping ID-like columns:", | |
| id_cols | |
| ) | |
| df = df.drop(columns=id_cols) | |
| report[ | |
| "dropped_columns" | |
| ].extend(id_cols) | |
| # drop constant column | |
| const_cols = [ | |
| col for col in df.columns | |
| if ( | |
| df[col] | |
| .nunique(dropna=False) | |
| <= 1 | |
| ) | |
| ] | |
| if const_cols: | |
| print( | |
| "Dropping constant columns:", | |
| const_cols | |
| ) | |
| df = df.drop(columns=const_cols) | |
| report[ | |
| "dropped_columns" | |
| ].extend(const_cols) | |
| # handle nulls | |
| for col in list(df.columns): | |
| if col == target: | |
| continue | |
| null_pct = ( | |
| df[col] | |
| .isnull() | |
| .mean() * 100 | |
| ) | |
| # too many nulls | |
| if null_pct > 40: | |
| print( | |
| f"Dropping '{col}' " | |
| f"(too many nulls: " | |
| f"{null_pct:.2f}%)" | |
| ) | |
| df = df.drop(columns=[col]) | |
| report[ | |
| "dropped_columns" | |
| ].append(col) | |
| continue | |
| # numerical | |
| if pd.api.types.is_numeric_dtype(df[col]): | |
| if null_pct > 0: | |
| if abs(df[col].skew()) > 1: | |
| df[col] = ( | |
| df[col] | |
| .fillna( | |
| df[col].median() | |
| ) | |
| ) | |
| else: | |
| df[col] = ( | |
| df[col] | |
| .fillna( | |
| df[col].mean() | |
| ) | |
| ) | |
| # categorical | |
| else: | |
| if null_pct > 0: | |
| if not df[col].mode().empty: | |
| df[col] = ( | |
| df[col] | |
| .fillna( | |
| df[col].mode()[0] | |
| ) | |
| ) | |
| # high card | |
| for col in df.select_dtypes( | |
| include="object" | |
| ): | |
| unique_vals = df[col].nunique() | |
| if unique_vals > 50: | |
| print( | |
| f"High cardinality " | |
| f"column: '{col}' " | |
| f"({unique_vals} unique)" | |
| ) | |
| report[ | |
| "high_cardinality" | |
| ].append(col) | |
| # outlier detection | |
| print("\nOutlier Detections:") | |
| for col in df.select_dtypes( | |
| include=["int64", "float64"] | |
| ): | |
| if df[col].nunique() <= 10: | |
| continue | |
| Q1 = df[col].quantile(0.25) | |
| Q3 = df[col].quantile(0.75) | |
| IQR = Q3 - Q1 | |
| if IQR == 0: | |
| continue | |
| lower = Q1 - 1.5 * IQR | |
| upper = Q3 + 1.5 * IQR | |
| count = ( | |
| ( | |
| (df[col] < lower) | |
| | | |
| (df[col] > upper) | |
| ) | |
| ).sum() | |
| ratio = count / len(df) | |
| if ratio > 0.05: | |
| print( | |
| f"- {col}: " | |
| f"{count} outliers " | |
| f"({ratio:.1%}) " | |
| f"--> high" | |
| ) | |
| elif ratio > 0: | |
| print( | |
| f"- {col}: " | |
| f"{count} outliers " | |
| f"({ratio:.1%})" | |
| ) | |
| report["outliers"][col] = int(count) | |
| #### final summary | |
| num_cols = df.select_dtypes( | |
| include=["int64", "float64"] | |
| ).columns | |
| cat_cols = df.select_dtypes( | |
| include="object" | |
| ).columns | |
| date_cols = df.select_dtypes( | |
| include="datetime64[ns]" | |
| ).columns | |
| print("\nFinal Data Summary:") | |
| print(f"- Rows: {df.shape[0]}") | |
| print(f"- Columns: {df.shape[1]}") | |
| print(f"- Numerical columns: {len(num_cols)}") | |
| print(f"- Categorical columns: {len(cat_cols)}") | |
| print(f"- Datetime columns: {len(date_cols)}") | |
| return df | |