saherPervaiz committed on
Commit
2925188
·
verified ·
1 Parent(s): ae471c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -14
app.py CHANGED
@@ -29,25 +29,24 @@ if uploaded_file is not None:
29
  for column in df.select_dtypes(include=['object']).columns:
30
  if df[column].nunique() < 10: # If the column has fewer unique values, encode it
31
  df[column] = le.fit_transform(df[column].astype(str))
32
-
33
  # Handle missing values (impute numerical columns with median and categorical columns with mode)
34
-
35
  # Handle categorical columns
36
  categorical_columns = df.select_dtypes(include=['object']).columns
37
  if len(categorical_columns) > 0:
38
  imputer = SimpleImputer(strategy='most_frequent')
39
  df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
40
-
41
  # Handle numerical columns
42
  numerical_columns = df.select_dtypes(include=['number']).columns
43
  if len(numerical_columns) > 0:
44
  imputer = SimpleImputer(strategy='median')
45
  df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
46
-
47
  # Remove outliers (using z-score method)
48
  z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
49
- df = df[(z_scores < 3).all(axis=1)]
50
-
51
  # Normalize numerical data
52
  scaler = StandardScaler()
53
  df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
@@ -58,16 +57,16 @@ if uploaded_file is not None:
58
  # Ensure that all columns are numeric before using in models
59
  for column in df.select_dtypes(include=['object']).columns:
60
  df[column] = pd.to_numeric(df[column], errors='coerce')
61
-
62
  return df
63
 
64
  # Apply the clean_dataset function
65
  df_cleaned = clean_dataset(df)
66
-
67
  # Show the cleaned dataset
68
  st.write("Cleaned Dataset:")
69
  st.dataframe(df_cleaned)
70
-
71
  # Model Training Section
72
  st.subheader("Model Training")
73
  if df_cleaned.empty:
@@ -77,10 +76,10 @@ if uploaded_file is not None:
77
  features = [col for col in df_cleaned.columns if col != target]
78
  X = df_cleaned[features]
79
  y = df_cleaned[target]
80
-
81
  # Determine if the target is continuous or categorical
82
  is_classification = len(y.unique()) <= 10 # If target has fewer than or equal to 10 unique values, treat as classification
83
-
84
  # Ensure there is enough data before proceeding with train-test split
85
  if len(X) == 0 or len(y) == 0:
86
  st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
@@ -88,12 +87,11 @@ if uploaded_file is not None:
88
  # Split the data into training and test sets with customizable training size
89
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
90
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
91
-
92
  # Store results in a dictionary
93
  results = []
94
 
95
  # Model Selection and Evaluation
96
- models = []
97
  if is_classification:
98
  model_choices = [
99
  ("Random Forest", RandomForestClassifier(n_estimators=50)),
@@ -124,7 +122,8 @@ if uploaded_file is not None:
124
  model.fit(X_train, y_train)
125
  y_pred = model.predict(X_test)
126
  mse = mean_squared_error(y_test, y_pred)
127
- results.append([name, None, mse])
 
128
 
129
  # Display results in a table
130
  st.subheader("Model Performance Results")
 
29
  for column in df.select_dtypes(include=['object']).columns:
30
  if df[column].nunique() < 10: # If the column has fewer unique values, encode it
31
  df[column] = le.fit_transform(df[column].astype(str))
32
+
33
  # Handle missing values (impute numerical columns with median and categorical columns with mode)
 
34
  # Handle categorical columns
35
  categorical_columns = df.select_dtypes(include=['object']).columns
36
  if len(categorical_columns) > 0:
37
  imputer = SimpleImputer(strategy='most_frequent')
38
  df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
39
+
40
  # Handle numerical columns
41
  numerical_columns = df.select_dtypes(include=['number']).columns
42
  if len(numerical_columns) > 0:
43
  imputer = SimpleImputer(strategy='median')
44
  df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
45
+
46
  # Remove outliers (using z-score method)
47
  z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
48
+ df = df[(z_scores < 3).all(axis=1)] # Removing rows where any column exceeds a Z-score of 3
49
+
50
  # Normalize numerical data
51
  scaler = StandardScaler()
52
  df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
 
57
  # Ensure that all columns are numeric before using in models
58
  for column in df.select_dtypes(include=['object']).columns:
59
  df[column] = pd.to_numeric(df[column], errors='coerce')
60
+
61
  return df
62
 
63
  # Apply the clean_dataset function
64
  df_cleaned = clean_dataset(df)
65
+
66
  # Show the cleaned dataset
67
  st.write("Cleaned Dataset:")
68
  st.dataframe(df_cleaned)
69
+
70
  # Model Training Section
71
  st.subheader("Model Training")
72
  if df_cleaned.empty:
 
76
  features = [col for col in df_cleaned.columns if col != target]
77
  X = df_cleaned[features]
78
  y = df_cleaned[target]
79
+
80
  # Determine if the target is continuous or categorical
81
  is_classification = len(y.unique()) <= 10 # If target has fewer than or equal to 10 unique values, treat as classification
82
+
83
  # Ensure there is enough data before proceeding with train-test split
84
  if len(X) == 0 or len(y) == 0:
85
  st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
 
87
  # Split the data into training and test sets with customizable training size
88
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
89
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
90
+
91
  # Store results in a dictionary
92
  results = []
93
 
94
  # Model Selection and Evaluation
 
95
  if is_classification:
96
  model_choices = [
97
  ("Random Forest", RandomForestClassifier(n_estimators=50)),
 
122
  model.fit(X_train, y_train)
123
  y_pred = model.predict(X_test)
124
  mse = mean_squared_error(y_test, y_pred)
125
+ accuracy = accuracy_score(y_test, y_pred)
126
+ results.append([name, accuracy, mse])
127
 
128
  # Display results in a table
129
  st.subheader("Model Performance Results")