saherPervaiz committed on
Commit
97a2e91
·
verified ·
1 Parent(s): 4b535f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -89
app.py CHANGED
@@ -1,10 +1,6 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import numpy as np
4
- import seaborn as sns
5
- import matplotlib.pyplot as plt
6
  from sklearn.model_selection import train_test_split
7
- from sklearn.impute import SimpleImputer
8
  from sklearn.preprocessing import LabelEncoder, StandardScaler
9
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
10
  from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -12,76 +8,35 @@ from sklearn.svm import SVC, SVR
12
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
13
  from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
14
  from sklearn.naive_bayes import GaussianNB
15
- from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
16
- from scipy import stats
17
 
18
  # File uploader
19
- st.title("Data Analysis and Model Training")
20
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
21
 
22
  if uploaded_file is not None:
23
  df = pd.read_csv(uploaded_file)
24
 
25
- # Data Cleaning Function
26
- def clean_dataset(df):
27
- # Convert categorical data to numeric using Label Encoding (only for categorical with few unique values)
28
- le = LabelEncoder()
29
- for column in df.select_dtypes(include=['object']).columns:
30
- if df[column].nunique() < 10: # If the column has fewer unique values, encode it
31
- df[column] = le.fit_transform(df[column].astype(str))
32
-
33
- # Handle missing values (impute numerical columns with median and categorical columns with mode)
34
- categorical_columns = df.select_dtypes(include=['object']).columns
35
- if len(categorical_columns) > 0:
36
- imputer = SimpleImputer(strategy='most_frequent')
37
- df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
38
-
39
- # Handle numerical columns
40
- numerical_columns = df.select_dtypes(include=['number']).columns
41
- if len(numerical_columns) > 0:
42
- imputer = SimpleImputer(strategy='median')
43
- df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
44
-
45
- # Remove outliers (using z-score method)
46
- z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
47
- df = df[(z_scores < 3).all(axis=1)]
48
-
49
- # Normalize numerical data
50
- scaler = StandardScaler()
51
- df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
52
-
53
- # Drop rows with any null values
54
- df = df.dropna()
55
-
56
- # Ensure that all columns are numeric before using in models
57
- for column in df.select_dtypes(include=['object']).columns:
58
- df[column] = pd.to_numeric(df[column], errors='coerce')
59
-
60
- return df
61
-
62
- # Apply the clean_dataset function
63
- df_cleaned = clean_dataset(df)
64
-
65
- # Show the cleaned dataset
66
- st.write("Cleaned Dataset:")
67
- st.dataframe(df_cleaned)
68
 
69
  # Model Training Section
70
  st.subheader("Model Training")
71
- if df_cleaned.empty:
72
- st.warning("The dataset is empty after cleaning. Please adjust your cleaning settings.")
73
  else:
74
- target = st.selectbox("Select Target Variable", df_cleaned.columns)
75
- features = [col for col in df_cleaned.columns if col != target]
76
- X = df_cleaned[features]
77
- y = df_cleaned[target]
78
 
79
  # Determine if the target is continuous or categorical
80
  is_classification = y.dtype == 'object' or len(y.unique()) <= 10 # If target is categorical or has few unique values, treat as classification
81
 
82
  # Ensure there is enough data before proceeding with train-test split
83
  if len(X) == 0 or len(y) == 0:
84
- st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
85
  else:
86
  # Split the data into training and test sets with customizable training size
87
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
@@ -143,38 +98,10 @@ if uploaded_file is not None:
143
  mime="text/csv"
144
  )
145
 
146
- # Option to download the cleaned dataset
147
  st.download_button(
148
- label="Download Cleaned Dataset",
149
- data=df_cleaned.to_csv(index=False),
150
- file_name="cleaned_dataset.csv",
151
  mime="text/csv"
152
  )
153
-
154
- # Download correlation heatmap
155
- st.subheader("Correlation Heatmap")
156
- correlation_matrix = df_cleaned.select_dtypes(include=['number']).corr()
157
- fig, ax = plt.subplots(figsize=(8, 6))
158
- sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
159
- st.pyplot(fig)
160
- fig.savefig("/tmp/correlation_heatmap.png")
161
- with open("/tmp/correlation_heatmap.png", "rb") as f:
162
- st.download_button(
163
- label="Download Correlation Heatmap",
164
- data=f,
165
- file_name="correlation_heatmap.png",
166
- mime="image/png"
167
- )
168
-
169
- # Pair plot of numerical columns to visualize relationships
170
- st.subheader("Pair Plot of Numerical Columns")
171
- pair_plot = sns.pairplot(df_cleaned[features]) # Generate pair plot for features
172
- st.pyplot(pair_plot)
173
- pair_plot.savefig("/tmp/pair_plot.png")
174
- with open("/tmp/pair_plot.png", "rb") as f:
175
- st.download_button(
176
- label="Download Pair Plot",
177
- data=f,
178
- file_name="pair_plot.png",
179
- mime="image/png"
180
- )
 
1
  import streamlit as st
2
  import pandas as pd
 
 
 
3
  from sklearn.model_selection import train_test_split
 
4
  from sklearn.preprocessing import LabelEncoder, StandardScaler
5
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
6
  from sklearn.linear_model import LogisticRegression, LinearRegression
 
8
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
9
  from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
10
  from sklearn.naive_bayes import GaussianNB
11
+ from sklearn.metrics import accuracy_score, mean_squared_error
 
12
 
13
  # File uploader
14
+ st.title("Model Training")
15
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
16
 
17
  if uploaded_file is not None:
18
  df = pd.read_csv(uploaded_file)
19
 
20
+ # Show the dataset
21
+ st.write("Dataset:")
22
+ st.dataframe(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # Model Training Section
25
  st.subheader("Model Training")
26
+ if df.empty:
27
+ st.warning("The dataset is empty. Please upload a valid CSV file.")
28
  else:
29
+ target = st.selectbox("Select Target Variable", df.columns)
30
+ features = [col for col in df.columns if col != target]
31
+ X = df[features]
32
+ y = df[target]
33
 
34
  # Determine if the target is continuous or categorical
35
  is_classification = y.dtype == 'object' or len(y.unique()) <= 10 # If target is categorical or has few unique values, treat as classification
36
 
37
  # Ensure there is enough data before proceeding with train-test split
38
  if len(X) == 0 or len(y) == 0:
39
+ st.warning("Insufficient data. Please ensure there are valid feature and target columns.")
40
  else:
41
  # Split the data into training and test sets with customizable training size
42
  train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
 
98
  mime="text/csv"
99
  )
100
 
101
+ # Option to download the dataset
102
  st.download_button(
103
+ label="Download Dataset",
104
+ data=df.to_csv(index=False),
105
+ file_name="dataset.csv",
106
  mime="text/csv"
107
  )