import pandas as pd
import math
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from typing import Any, Dict, cast
import math
import numpy as np
def clean_data(data: Any) -> Any:
    """Recursively make a value JSON-safe for a FastAPI response.

    FastAPI/JSON cannot serialize NaN or +/-Inf, so those become ``None``.
    Dicts and lists are walked recursively; numpy scalars are converted to
    native Python numbers (ints stay ints so JSON output is e.g. ``5``,
    not ``5.0``). Any other value is returned unchanged.

    Parameters
    ----------
    data : Any
        Arbitrary nested structure of dicts, lists, and scalar values.

    Returns
    -------
    Any
        The same structure with numpy scalars converted and every
        non-finite float replaced by ``None``.
    """
    if isinstance(data, dict):
        return {k: clean_data(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [clean_data(v) for v in data]
    elif isinstance(data, np.integer):
        # numpy ints are always finite; keep them as ints, not floats.
        return int(data)
    elif isinstance(data, np.floating):
        value = float(data)
        if math.isnan(value) or math.isinf(value):
            return None
        return value
    elif isinstance(data, float):
        if math.isnan(data) or math.isinf(data):
            return None
        return data
    return data
def get_correlation(df, target) -> Dict[str, Any]:
    """Compute correlation matrices and (optionally) feature importances.

    Preprocessing: drops ``id`` columns (case-insensitive), imputes
    missing values (mean for numeric, mode for categorical), label-encodes
    object columns, and removes constant columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is copied and never mutated.
    target : str or falsy
        Falsy -> EDA mode (correlations only). Otherwise the name of the
        target column, and a RandomForest ranks feature importance.

    Returns
    -------
    Dict[str, Any]
        A JSON-safe summary dict, or a ``{"message": ...}`` dict when
        fewer than two usable numeric columns remain.

    Raises
    ------
    ValueError
        If *target* is given but not present in the processed columns.
    """
    df_processed = df.copy()

    # Identifier columns carry no statistical signal; drop them.
    id_columns = [col for col in df_processed.columns if col.lower() == "id"]
    df_processed = df_processed.drop(columns=id_columns)

    # Impute missing values: mean for numeric, mode for everything else.
    for col in df_processed.columns:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
        else:
            mode = df_processed[col].mode()
            # mode() returns an EMPTY Series when the column is entirely
            # NaN; fall back to "" instead of raising IndexError.
            fill = mode.iloc[0] if not mode.empty else ""
            df_processed[col] = df_processed[col].fillna(fill)

    # Label-encode object columns so they can enter the correlation matrix.
    encoded_columns = []
    for col in df_processed.select_dtypes(include="object").columns:
        df_processed[col] = LabelEncoder().fit_transform(df_processed[col])
        encoded_columns.append(col)

    # Keep numeric data and drop constant columns (zero variance makes
    # the correlation undefined).
    df_numeric = df_processed.select_dtypes(include="number")
    df_numeric = df_numeric.loc[:, df_numeric.nunique() > 1]
    if df_numeric.shape[1] < 2:
        return {
            "message": "Not enough numeric columns for correlation"
        }

    # Always compute both correlation flavors.
    pearson_corr = _safe_corr(df_numeric, "pearson")
    spearman_corr = _safe_corr(df_numeric, "spearman")

    # Keys shared by both response modes (order matches the response).
    summary = {
        "rows": df.shape[0],
        "columns": df.shape[1],
        "column_names": df.columns.to_list(),
        "encoded_columns": encoded_columns,
        "final_column_count": df_numeric.shape[1],
        "pearson": pearson_corr,
        "spearman": spearman_corr,
    }

    # EDA mode: no target supplied.
    if not target:
        # cast() only informs the type checker; no runtime conversion.
        return cast(Dict[str, Any], clean_data({"mode": "eda", **summary}))

    # ML mode: rank features with a RandomForest.
    if target not in df_processed.columns:
        raise ValueError(f"Target column '{target}' not found")
    # Encode a non-numeric target so the classifier accepts it.
    if not pd.api.types.is_numeric_dtype(df_processed[target]):
        df_processed[target] = LabelEncoder().fit_transform(df_processed[target])

    X = df_processed.drop(columns=[target]).select_dtypes(include="number")
    y = df_processed[target]
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    feature_importance = {
        col: round(float(imp), 4)
        for col, imp in zip(X.columns, model.feature_importances_)
    }
    feature_importance = dict(
        sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    )
    return cast(Dict[str, Any], clean_data({
        "mode": "ml",
        **summary,
        # Only the top five features go into the response.
        "feature_importance": dict(list(feature_importance.items())[:5]),
    }))


def _safe_corr(df_numeric, method: str) -> Dict[str, Any]:
    """Correlation matrix as a plain dict with inf/NaN mapped to 0."""
    corr = df_numeric.corr(method=method)
    corr = corr.replace([np.inf, -np.inf], np.nan).fillna(0)
    return corr.to_dict()