saherPervaiz committed on
Commit
c033d78
·
verified ·
1 Parent(s): 2925188

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -29,24 +29,23 @@ if uploaded_file is not None:
29
  for column in df.select_dtypes(include=['object']).columns:
30
  if df[column].nunique() < 10: # If the column has fewer unique values, encode it
31
  df[column] = le.fit_transform(df[column].astype(str))
32
-
33
  # Handle missing values (impute numerical columns with median and categorical columns with mode)
34
- # Handle categorical columns
35
  categorical_columns = df.select_dtypes(include=['object']).columns
36
  if len(categorical_columns) > 0:
37
  imputer = SimpleImputer(strategy='most_frequent')
38
  df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
39
-
40
  # Handle numerical columns
41
  numerical_columns = df.select_dtypes(include=['number']).columns
42
  if len(numerical_columns) > 0:
43
  imputer = SimpleImputer(strategy='median')
44
  df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
45
-
46
  # Remove outliers (using z-score method)
47
  z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
48
- df = df[(z_scores < 3).all(axis=1)] # Removing rows where any column exceeds a Z-score of 3
49
-
50
  # Normalize numerical data
51
  scaler = StandardScaler()
52
  df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
@@ -57,16 +56,16 @@ if uploaded_file is not None:
57
  # Ensure that all columns are numeric before using in models
58
  for column in df.select_dtypes(include=['object']).columns:
59
  df[column] = pd.to_numeric(df[column], errors='coerce')
60
-
61
  return df
62
 
63
  # Apply the clean_dataset function
64
  df_cleaned = clean_dataset(df)
65
-
66
  # Show the cleaned dataset
67
  st.write("Cleaned Dataset:")
68
  st.dataframe(df_cleaned)
69
-
70
  # Model Training Section
71
  st.subheader("Model Training")
72
  if df_cleaned.empty:
@@ -76,10 +75,10 @@ if uploaded_file is not None:
76
  features = [col for col in df_cleaned.columns if col != target]
77
  X = df_cleaned[features]
78
  y = df_cleaned[target]
79
-
80
  # Determine if the target is continuous or categorical
81
  is_classification = len(y.unique()) <= 10 # If target has fewer than or equal to 10 unique values, treat as classification
82
-
83
  # Ensure there is enough data before proceeding with train-test split
84
  if len(X) == 0 or len(y) == 0:
85
  st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
@@ -87,7 +86,7 @@ if uploaded_file is not None:
87
  # Split the data into training and test sets with customizable training size
88
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
89
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
90
-
91
  # Store results in a dictionary
92
  results = []
93
 
@@ -122,8 +121,7 @@ if uploaded_file is not None:
122
  model.fit(X_train, y_train)
123
  y_pred = model.predict(X_test)
124
  mse = mean_squared_error(y_test, y_pred)
125
- accuracy = accuracy_score(y_test, y_pred)
126
- results.append([name, accuracy, mse])
127
 
128
  # Display results in a table
129
  st.subheader("Model Performance Results")
 
29
  for column in df.select_dtypes(include=['object']).columns:
30
  if df[column].nunique() < 10: # If the column has fewer unique values, encode it
31
  df[column] = le.fit_transform(df[column].astype(str))
32
+
33
  # Handle missing values (impute numerical columns with median and categorical columns with mode)
 
34
  categorical_columns = df.select_dtypes(include=['object']).columns
35
  if len(categorical_columns) > 0:
36
  imputer = SimpleImputer(strategy='most_frequent')
37
  df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
38
+
39
  # Handle numerical columns
40
  numerical_columns = df.select_dtypes(include=['number']).columns
41
  if len(numerical_columns) > 0:
42
  imputer = SimpleImputer(strategy='median')
43
  df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
44
+
45
  # Remove outliers (using z-score method)
46
  z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
47
+ df = df[(z_scores < 3).all(axis=1)]
48
+
49
  # Normalize numerical data
50
  scaler = StandardScaler()
51
  df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
 
56
  # Ensure that all columns are numeric before using in models
57
  for column in df.select_dtypes(include=['object']).columns:
58
  df[column] = pd.to_numeric(df[column], errors='coerce')
59
+
60
  return df
61
 
62
  # Apply the clean_dataset function
63
  df_cleaned = clean_dataset(df)
64
+
65
  # Show the cleaned dataset
66
  st.write("Cleaned Dataset:")
67
  st.dataframe(df_cleaned)
68
+
69
  # Model Training Section
70
  st.subheader("Model Training")
71
  if df_cleaned.empty:
 
75
  features = [col for col in df_cleaned.columns if col != target]
76
  X = df_cleaned[features]
77
  y = df_cleaned[target]
78
+
79
  # Determine if the target is continuous or categorical
80
  is_classification = len(y.unique()) <= 10 # If target has fewer than or equal to 10 unique values, treat as classification
81
+
82
  # Ensure there is enough data before proceeding with train-test split
83
  if len(X) == 0 or len(y) == 0:
84
  st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
 
86
  # Split the data into training and test sets with customizable training size
87
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
88
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
89
+
90
  # Store results in a dictionary
91
  results = []
92
 
 
121
  model.fit(X_train, y_train)
122
  y_pred = model.predict(X_test)
123
  mse = mean_squared_error(y_test, y_pred)
124
+ results.append([name, None, mse])
 
125
 
126
  # Display results in a table
127
  st.subheader("Model Performance Results")