File size: 4,028 Bytes
1067825
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import math
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from typing import Any, Dict, cast

import math
import numpy as np

def clean_data(data):
    """Recursively make *data* JSON-safe for an API response.

    FastAPI (and ``json.dumps``) cannot serialize NaN/Inf or NumPy
    scalar/array types. This walks dicts, lists and tuples, converts
    NumPy scalars to native Python types, and replaces any NaN or
    infinite float with ``None``.

    Returns the cleaned structure; non-numeric leaves pass through
    unchanged.
    """
    if isinstance(data, dict):
        return {k: clean_data(v) for k, v in data.items()}

    if isinstance(data, (list, tuple)):
        return [clean_data(v) for v in data]

    # NumPy arrays: convert to nested lists, then clean element-wise.
    if isinstance(data, np.ndarray):
        return clean_data(data.tolist())

    # NumPy bools are not JSON-serializable; map to native bool.
    if isinstance(data, np.bool_):
        return bool(data)

    # NumPy ints stay ints (previously coerced to float, e.g. 5 -> 5.0).
    if isinstance(data, np.integer):
        return int(data)

    if isinstance(data, np.floating):
        value = float(data)
        if math.isnan(value) or math.isinf(value):
            return None
        return value

    if isinstance(data, float):
        if math.isnan(data) or math.isinf(data):
            return None
        return data

    return data


def get_correlation(df, target) -> Dict[str, Any]:
    """Compute correlation matrices for *df*, plus feature importances
    when a target column is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data; not mutated (a copy is processed).
    target : str or falsy
        Falsy (``None`` / ``""``) selects EDA mode: only correlations
        are returned. A column name selects ML mode, which also fits a
        RandomForestClassifier and reports the top-5 feature importances.

    Returns
    -------
    Dict[str, Any]
        JSON-safe summary (``mode``, shapes, ``pearson``, ``spearman``,
        and ``feature_importance`` in ML mode), or a ``message`` dict
        when fewer than two usable numeric columns remain.

    Raises
    ------
    ValueError
        If *target* is given but is not a column of *df*.
    """
    df_processed = df.copy()

    # Drop identifier columns: they carry no signal for correlation.
    id_columns = [col for col in df_processed.columns if col.lower() == "id"]
    df_processed = df_processed.drop(columns=id_columns)

    # Impute missing values: mean for numeric, mode for everything else.
    for col in df_processed.columns:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
        else:
            mode_values = df_processed[col].mode()
            # An all-NaN column has an empty mode; indexing [0] would
            # raise IndexError, so leave such a column untouched.
            if not mode_values.empty:
                df_processed[col] = df_processed[col].fillna(mode_values[0])

    # Label-encode object columns so they participate in correlation.
    encoded_columns = []
    le = LabelEncoder()
    for col in df_processed.select_dtypes(include="object").columns:
        df_processed[col] = le.fit_transform(df_processed[col])
        encoded_columns.append(col)

    # Keep numeric data only, and drop constant columns (zero variance
    # makes correlation undefined).
    df_numeric = df_processed.select_dtypes(include="number")
    df_numeric = df_numeric.loc[:, df_numeric.nunique() > 1]

    if df_numeric.shape[1] < 2:
        return {
            "message": "Not enough numeric columns for correlation"
        }

    # Correlations are computed in both modes; Inf/NaN entries are
    # zeroed so the payload stays JSON-safe.
    pearson_df = df_numeric.corr()
    pearson_df = pearson_df.replace([np.inf, -np.inf], np.nan)
    pearson_df = pearson_df.fillna(0)
    pearson_corr = pearson_df.to_dict()

    spearman_df = df_numeric.corr(method="spearman")
    spearman_df = spearman_df.replace([np.inf, -np.inf], np.nan)
    spearman_df = spearman_df.fillna(0)
    spearman_corr = spearman_df.to_dict()

    # EDA mode: no target supplied.
    if not target:
        # cast() only narrows the static type for the checker;
        # it has no runtime effect.
        return cast(Dict[str, Any], clean_data({
            "mode": "eda",
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": df.columns.to_list(),
            "encoded_columns": encoded_columns,
            "final_column_count": df_numeric.shape[1],
            "pearson": pearson_corr,
            "spearman": spearman_corr
        }))

    # ML mode: validate and (if needed) encode the target.
    if target not in df_processed.columns:
        raise ValueError(f"Target column '{target}' not found")

    if not pd.api.types.is_numeric_dtype(df_processed[target]):
        df_processed[target] = LabelEncoder().fit_transform(df_processed[target])

    X = df_processed.drop(columns=[target]).select_dtypes(include="number")
    y = df_processed[target]

    # Fixed random_state keeps importances reproducible across calls.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    feature_importance = {
        col: round(float(imp), 4)
        for col, imp in zip(X.columns, model.feature_importances_)
    }
    feature_importance = dict(
        sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    )

    return cast(Dict[str, Any], clean_data({
        "mode": "ml",
        "rows": df.shape[0],
        "columns": df.shape[1],
        "column_names": df.columns.to_list(),
        "encoded_columns": encoded_columns,
        "final_column_count": df_numeric.shape[1],
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "feature_importance": dict(list(feature_importance.items())[:5]),
    }))