"""Data Ocean: a Streamlit page for exploring an uploaded table and fitting a quick classifier.

The page lets the user upload a CSV/Excel file, inspect it (shape, head/tail,
dtypes, missing values, duplicates), count values per column, build a
group-by summary with a chart, and train one of four scikit-learn
classifiers on selected columns.
"""

# _____________ Import Python Libraries _________________ #
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Menu label -> estimator class. Insertion order is the order shown in the UI.
_CLASSIFIERS = {
    "SVM": SVC,
    "Logistic Regression": LogisticRegression,
    "Decision Tree": DecisionTreeClassifier,
    "K-Nearest Neighbors": KNeighborsClassifier,
}


def _read_uploaded_file(file) -> pd.DataFrame:
    """Parse the object returned by ``st.file_uploader`` into a DataFrame.

    The extension check is case-insensitive so that e.g. ``REPORT.CSV`` is
    accepted instead of silently leaving the data undefined.

    Raises:
        ValueError: if the file name ends in neither ``.csv`` nor ``.xlsx``.
    """
    name = file.name.lower()
    if name.endswith('.csv'):
        return pd.read_csv(file)
    if name.endswith('.xlsx'):
        return pd.read_excel(file)
    raise ValueError(f"Unsupported file type: {file.name}")


def _fill_numeric_nulls(data: pd.DataFrame, strategy: str) -> pd.DataFrame:
    """Return a copy of *data* with NaNs in numeric columns replaced.

    *strategy* is one of ``'Mean'``, ``'Median'`` or ``'Mode'``. Columns are
    reassigned rather than filled with ``inplace=True`` on a column selection,
    which pandas deprecates and which is a no-op under Copy-on-Write.
    """
    filled = data.copy()
    for col in filled.select_dtypes(include=[np.number]).columns:
        series = filled[col]
        if strategy == 'Mean':
            replacement = series.mean()
        elif strategy == 'Median':
            replacement = series.median()
        elif strategy == 'Mode':
            modes = series.mode()
            if modes.empty:
                # All-NaN column: there is no mode to fill with.
                continue
            replacement = modes.iloc[0]
        else:
            continue
        filled[col] = series.fillna(replacement)
    return filled


def _row_slider(row_count: int, key: str) -> int:
    """Ask how many rows to show; skip the slider when its range would be invalid."""
    if row_count > 1:
        return st.slider('Number of Rows to Fetch', 1, row_count, key=key)
    # With 0 or 1 rows a 1..row_count slider has no usable range.
    return row_count


def _show_basic_info(data: pd.DataFrame) -> pd.DataFrame:
    """Render the basic-information tabs and return the (possibly cleaned) data.

    The returned frame reflects any missing-value removal/filling or duplicate
    removal the user selected.
    """
    # ________________ Basic Info Summary Section ______________ #
    st.subheader(':rainbow[Basic Information of The Dataset]', divider='violet')
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
        ['Summary', 'Top & Bottom Rows', 'Data Types', 'Columns', 'Missing Values', 'Duplicates Value']
    )

    with tab1:
        st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
        st.subheader(':blue[Statistical Summary]')
        st.dataframe(data.describe())

    with tab2:
        st.subheader(':gray[Top Rows]')
        top_rows = _row_slider(data.shape[0], key='topslider')
        st.dataframe(data.head(top_rows))
        st.subheader(':green[Bottom Rows]')
        bottom_rows = _row_slider(data.shape[0], key='bottomslider')
        st.dataframe(data.tail(bottom_rows))

    with tab3:
        st.subheader(':orange[Data Types]')
        st.write(data.dtypes.tolist())

    with tab4:
        st.subheader(':green[Columns]')
        st.write(data.columns.tolist())

    with tab5:
        st.subheader(':red[Missing Values]')
        missing_values = data.isnull().sum()
        st.dataframe(missing_values)
        if missing_values.sum() > 0:
            remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])
            with remove_tab:
                if st.checkbox("Remove Rows with Missing Values"):
                    # dropna(inplace=True) returns None; keep the returned frame instead.
                    data = data.dropna()
                    st.success('Rows with missing values removed!', icon="🎉")
            with fill_tab:
                replace_nulls = st.selectbox(
                    'Replace Missing Values With:', ['None', 'Mean', 'Median', 'Mode']
                )
                if replace_nulls != 'None':
                    data = _fill_numeric_nulls(data, replace_nulls)
                    st.success("Missing values replaced successfully!", icon='✅')
        else:
            st.success("No missing values detected.", icon='🔥')

    with tab6:
        st.subheader(':green[Duplicate Values]')
        duplicates = data.duplicated().sum()
        if duplicates == 0:
            st.info(' No Duplicates Value Found', icon='🔥')
        # The checkbox is only created when there is something to remove.
        if duplicates > 0 and st.checkbox('Remove Duplicates'):
            data = data.drop_duplicates()
            st.success('Duplicate rows removed!', icon='🔥')

    return data


def _show_value_counts(data: pd.DataFrame) -> None:
    """Render the per-column value-count table with bar, line and pie charts."""
    # __________________ Value Count Section _____________________ #
    st.subheader(':rainbow[Column Value Count]', divider='green')
    with st.expander('Value Count'):
        col1, col2 = st.columns(2)
        with col1:
            column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
        with col2:
            toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)

        # Compare with None so falsy labels such as 0 or '' stay selectable.
        if column is not None:
            result = data[column].value_counts().reset_index().head(toprows)
            result.columns = [column, 'count']
            st.dataframe(result)
            if not result.empty:
                fig = px.bar(data_frame=result, x=column, y='count', template='plotly_white')
                st.plotly_chart(fig)
                fig = px.line(data_frame=result, x=column, y='count')
                st.plotly_chart(fig)
                fig = px.pie(data_frame=result, names=column, values='count')
                st.plotly_chart(fig)


def _plot_grouped_result(result: pd.DataFrame) -> None:
    """Let the user pick a chart type and its columns, then plot *result*."""
    st.subheader(':rainbow[Data Visualization]')
    graph_type = st.selectbox(
        'Choose Graph Type', options=['line', 'bar', 'scatter', 'pie', 'sunburst']
    )
    columns = result.columns.tolist()
    optional_columns = [None] + columns

    if graph_type == 'line':
        x_axis = st.selectbox('X Axis', options=columns)
        y_axis = st.selectbox('Y Axis', options=columns)
        fig = px.line(data_frame=result, x=x_axis, y=y_axis)
    elif graph_type == 'bar':
        x_axis = st.selectbox('X Axis', options=columns)
        y_axis = st.selectbox('Y Axis', options=columns)
        color = st.selectbox('Color Information', options=optional_columns)
        fig = px.bar(data_frame=result, x=x_axis, y=y_axis, color=color)
    elif graph_type == 'pie':
        values = st.selectbox("Numerical Values", options=columns)
        names = st.selectbox('Labels', options=columns)
        fig = px.pie(data_frame=result, names=names, values=values)
    elif graph_type == 'scatter':
        x_axis = st.selectbox('X Axis', options=columns)
        y_axis = st.selectbox('Y Axis', options=columns)
        size = st.selectbox('Size Column', options=optional_columns)
        color = st.selectbox('Color Information', options=optional_columns)
        fig = px.scatter(data_frame=result, x=x_axis, y=y_axis, color=color, size=size)
    elif graph_type == 'sunburst':
        path = st.multiselect('Path', options=columns)
        if not path:
            # An empty path cannot be drawn; wait for a selection instead of
            # raising and aborting the rest of the page.
            st.info('Select at least one column for the sunburst path.')
            return
        fig = px.sunburst(data_frame=result, path=path, values='newcol')
    else:
        return
    st.plotly_chart(fig)


def _show_groupby(data: pd.DataFrame) -> None:
    """Render the group-by builder; the aggregate is stored in a column named ``newcol``."""
    # ______________ GroupBy Section _________________________ #
    st.subheader(':blue[Groupby : Simplify Your Data Analysis]', divider='violet')
    st.write("Groupby allows you to summarize data by categories.")
    with st.expander('Group By Your Columns'):
        col1, col2, col3 = st.columns(3)
        with col1:
            groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())
        with col2:
            operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())
        with col3:
            operation = st.selectbox(
                "Choose Operation", options=['sum', 'max', 'min', 'count', 'mean', 'median']
            )

        if groupby_cols and operation_col is not None and operation:
            result = (
                data.groupby(groupby_cols)
                .agg(newcol=(operation_col, operation))
                .reset_index()
            )
            st.dataframe(result)
            _plot_grouped_result(result)


def _build_model(ml_task: str, X: pd.DataFrame) -> Pipeline:
    """Build the preprocessing + classifier pipeline for the chosen task.

    int64/float64 columns are mean-imputed and standardised; object columns
    are imputed with the constant ``'missing'`` and one-hot encoded. Columns
    of any other dtype are not listed in either transformer and are therefore
    dropped by ``ColumnTransformer`` (its default ``remainder='drop'``).
    """
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing data
        ('scaler', StandardScaler()),  # Normalize numerical data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing data
        ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-hot encode categorical features
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', _CLASSIFIERS[ml_task]()),
    ])


def _show_machine_learning(data: pd.DataFrame) -> None:
    """Render the ML section: train the chosen classifier and report test accuracy."""
    # _________________ Machine Learning _______________ #
    st.subheader(":orange[Basic Machine Learning]", divider='green')
    ml_task = st.selectbox("Select ML Task", ["None", *_CLASSIFIERS])
    if ml_task == "None":
        return

    target_col = st.selectbox("Select Target Column", data.columns)
    # The target is not offered as a feature: predicting a column from itself
    # leaks the label and yields a meaningless accuracy.
    feature_cols = st.multiselect(
        "Select Feature Columns", [col for col in data.columns if col != target_col]
    )
    if target_col is None or not feature_cols:
        return

    # Rows without a label cannot be used for training or scoring.
    labelled = data.dropna(subset=[target_col])
    X = labelled[feature_cols]
    y = labelled[target_col]

    model = _build_model(ml_task, X)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")


def main() -> None:
    """Lay out the whole page; every section runs top to bottom on each rerun."""
    # ________________ Page Configuration Section _____________ #
    st.set_page_config(
        page_title="Data Ocean",
        page_icon='🔥',
    )

    # _________________ Web Page Info Section _____________________ #
    st.title(":red[Data] :blue[Analytic] :orange[Portal & Machine Learning]")
    st.header(":rainbow[Explore Data With Ease]")

    # __________________ File Upload Section _________________ #
    file = st.file_uploader('Drop Your CSV, Excel', type=['csv', 'xlsx'])
    if file is None:
        return

    # Top-level boundary: any failure in a section is reported on the page
    # instead of surfacing as a raw traceback.
    try:
        data = _read_uploaded_file(file)
        st.dataframe(data)
        st.success("File Successfully Uploaded", icon='🎉')

        data = _show_basic_info(data)
        _show_value_counts(data)
        _show_groupby(data)
        _show_machine_learning(data)
    except Exception as e:
        st.error(f"An error occurred: {e}")


# Called unconditionally so the page renders whenever this file is executed,
# exactly as the original flat script did.
main()