# DataDecoder / correlationService.py
# dewmisam's picture
# Upload 19 files
# 1067825 verified
import pandas as pd
import math
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from typing import Any, Dict, cast
import math
import numpy as np
def clean_data(data):
    """Recursively convert *data* into a JSON-safe structure.

    JSON has no representation for NaN or +/-Infinity, so every non-finite
    float found anywhere in the structure is replaced by ``None``. NumPy
    scalars and arrays are converted to their plain-Python equivalents.

    Args:
        data: Any value; dicts, lists, tuples and NumPy arrays are walked
            recursively.

    Returns:
        The cleaned value. Dict keys are kept as-is; lists, tuples and
        arrays come back as lists; NumPy integers become ``int``, NumPy
        booleans become ``bool``, finite floats become ``float`` and
        non-finite floats become ``None``. Anything else is returned
        unchanged.
    """
    if isinstance(data, dict):
        return {key: clean_data(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        return [clean_data(item) for item in data]
    if isinstance(data, np.ndarray):
        # tolist() yields nested Python lists/scalars, which still need the
        # NaN/Inf scan.
        return clean_data(data.tolist())
    if isinstance(data, np.bool_):
        return bool(data)
    if isinstance(data, np.integer):
        # Keep integers integral: going through float() would turn 5 into
        # 5.0 and lose precision above 2**53.
        return int(data)
    if isinstance(data, (float, np.floating)):
        value = float(data)
        return value if math.isfinite(value) else None
    return data
def _correlation_table(frame, method):
    """Return the *method* correlation matrix of *frame* as a nested dict.

    Infinite and undefined coefficients are replaced by 0.
    """
    table = frame.corr(method=method)
    table = table.replace([np.inf, -np.inf], np.nan).fillna(0)
    return table.to_dict()


def get_correlation(df, target) -> Dict[str, Any]:
    """Compute correlation matrices and, optionally, feature importances.

    The frame is copied, columns named "id" (any casing) are dropped,
    missing values are imputed (mean for numeric columns, most frequent
    value otherwise) and object columns are label-encoded. Pearson and
    Spearman correlations are then computed over the non-constant numeric
    columns.

    Args:
        df: Input pandas DataFrame. It is not modified.
        target: Name of the target column. A falsy value (``None``, ``""``)
            selects EDA mode; otherwise a RandomForestClassifier is fitted
            on the remaining numeric columns to rank them, so the target is
            treated as a class label.

    Returns:
        ``{"message": ...}`` when fewer than two non-constant numeric
        columns remain. Otherwise a JSON-safe dict with the keys ``mode``
        ("eda" or "ml"), ``rows``, ``columns``, ``column_names``,
        ``encoded_columns``, ``final_column_count``, ``pearson`` and
        ``spearman``; in ML mode also ``feature_importance`` holding the
        five highest-ranked features.

    Raises:
        ValueError: If *target* is given but is not among the processed
            columns.
    """
    df_processed = df.copy()

    # str() keeps non-string column labels (e.g. integers) from crashing
    # the comparison; an explicitly requested target is never discarded.
    id_columns = [
        col
        for col in df_processed.columns
        if str(col).lower() == "id" and col != target
    ]
    df_processed = df_processed.drop(columns=id_columns)

    # A column without a single observed value has no mean and no mode to
    # impute from, so it is removed before imputation.
    df_processed = df_processed.dropna(axis=1, how="all")

    for col in df_processed.columns:
        series = df_processed[col]
        if pd.api.types.is_numeric_dtype(series):
            df_processed[col] = series.fillna(series.mean())
        else:
            df_processed[col] = series.fillna(series.mode().iloc[0])

    encoded_columns = []
    for col in df_processed.select_dtypes(include="object").columns:
        # Encode the string form so that mixed-type object columns
        # (e.g. str and int values together) can still be sorted.
        df_processed[col] = LabelEncoder().fit_transform(
            df_processed[col].astype(str)
        )
        encoded_columns.append(col)

    # Correlation is only defined for numeric, non-constant columns.
    df_numeric = df_processed.select_dtypes(include="number")
    df_numeric = df_numeric.loc[:, df_numeric.nunique() > 1]

    if df_numeric.shape[1] < 2:
        return {
            "message": "Not enough numeric columns for correlation"
        }

    # Shape and column names describe the frame as it was passed in, not
    # the processed copy.
    result = {
        "mode": "eda",
        "rows": df.shape[0],
        "columns": df.shape[1],
        "column_names": df.columns.to_list(),
        "encoded_columns": encoded_columns,
        "final_column_count": df_numeric.shape[1],
        "pearson": _correlation_table(df_numeric, "pearson"),
        "spearman": _correlation_table(df_numeric, "spearman"),
    }

    # EDA mode: no target requested. cast() only informs the type checker
    # that clean_data() returns a dict here; it does nothing at runtime.
    if not target:
        return cast(Dict[str, Any], clean_data(result))

    # ML mode
    if target not in df_processed.columns:
        raise ValueError(f"Target column '{target}' not found")

    # Object targets were already encoded above; this covers the remaining
    # non-numeric dtypes.
    if not pd.api.types.is_numeric_dtype(df_processed[target]):
        df_processed[target] = LabelEncoder().fit_transform(df_processed[target])

    X = df_processed.drop(columns=[target]).select_dtypes(include="number")
    y = df_processed[target]

    # Fixed seed so repeated requests give the same ranking.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    ranked = sorted(
        (
            (col, round(float(importance), 4))
            for col, importance in zip(X.columns, model.feature_importances_)
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    result["mode"] = "ml"
    result["feature_importance"] = dict(ranked[:5])
    return cast(Dict[str, Any], clean_data(result))