Spaces:

1MR
/

ARAG

Sleeping

App Files Files Community

1MR commited on Dec 22, 2024

Commit

395ef11

verified ·

1 Parent(s): 587e047

Update Preprocessing2.py

Browse files

Files changed (1) hide show

Preprocessing2.py +212 -217

Preprocessing2.py CHANGED Viewed

@@ -1,217 +1,212 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import io
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import LabelEncoder
-import seaborn as sns
-import base64
-def handle_categorical_values():
-    if "data" in st.session_state:
-        data = st.session_state["data"]
-        st.subheader("Handle Categorical Values")
-        categorical_cols_features = list(
-            data.select_dtypes(include="object").columns)
-        # One-Hot Encoding for nominal categorical features
-        one_hot_enc = st.multiselect(
-            "Select nominal categorical columns", categorical_cols_features)
-        # Apply one-hot encoding to selected columns
-        if one_hot_enc:
-            for column in one_hot_enc:
-                if data[column].dtype == 'object':  # Only apply to categorical/string columns
-                    data = pd.get_dummies(data, columns=[column])
-            st.write("### Data after One-Hot Encoding:")
-            st.write(data.head())
-        # Label Encoding for ordinal categorical features
-        label_encoder = LabelEncoder()
-        label_enc = st.multiselect(
-            "Select ordinal categorical columns", categorical_cols_features)
-        # Apply label encoding to selected columns
-        if label_enc:
-            for column in label_enc:
-                if data[column].dtype == 'object':  # Only apply to categorical/string columns
-                    data[column] = label_encoder.fit_transform(data[column])
-            st.write("### Data after Label Encoding:")
-            st.write(data.head())
-    else:
-        st.warning("Please upload a dataset to handle categorical values.")
-def missing_values():
-    st.title("Handle Missing Values")
-    if "data" in st.session_state:
-        data = st.session_state["data"].copy()
-        action = st.selectbox(
-            "Select Action", ["Drop", "Dropna", "Fill missing val"])
-        column = st.selectbox("Select Column", data.columns)
-        st.write("### Before:")
-        st.dataframe(data)
-        modified_data = data.copy()
-        if action == "Drop":
-            modified_data.drop(columns=[column], inplace=True)
-        elif action == "Dropna":
-            modified_data.dropna(subset=[column], inplace=True)
-        elif action == "Fill missing val":
-            fill_method = st.selectbox(
-                "Select fill method", ["Mean", "Mode", "Median"])
-            if fill_method == "Mean":
-                fill_value = data[column].mean()
-            elif fill_method == "Mode":
-                fill_value = data[column].mode()[0]
-            elif fill_method == "Median":
-                fill_value = data[column].median()
-            modified_data[column].fillna(fill_value, inplace=True)
-        st.write("### After (Preview):")
-        st.dataframe(modified_data)
-        if st.button("OK"):
-            st.session_state["data"] = modified_data
-            st.success("Done! The action has been applied.")
-            st.write("### After:")
-            st.dataframe(modified_data)
-    else:
-        st.warning("Please upload a dataset first.")
-def handle_duplicates():
-    st.title("Handle Duplicates")
-    if "data" in st.session_state:
-        data = st.session_state["data"].copy()
-        action = st.selectbox(
-            "Select Action", ["Drop Duplicates", "Drop Duplicates in Column", "Keep First", "Keep Last"])
-        if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
-            column = st.selectbox("Select Column", data.columns)
-        else:
-            column = None
-        st.write("### Before:")
-        st.dataframe(data)
-        after_placeholder = st.empty()
-        modified_data = data.copy()
-        if action == "Drop Duplicates":
-            modified_data.drop_duplicates(inplace=True)
-        elif action == "Drop Duplicates in Column":
-            modified_data.drop_duplicates(subset=[column], inplace=True)
-        elif action == "Keep First":
-            modified_data.drop_duplicates(
-                subset=[column], keep="first", inplace=True)
-        elif action == "Keep Last":
-            modified_data.drop_duplicates(
-                subset=[column], keep="last", inplace=True)
-        st.write("### After (Preview):")
-        st.dataframe(modified_data)
-        if st.button("OK"):
-            st.session_state["data"] = modified_data
-            st.success("Done! The action has been applied.")
-            st.write("### After:")
-            st.dataframe(modified_data)
-    else:
-        st.warning("Please upload a dataset first.")
-def handle_outliers():
-    st.title("Handle Outliers")
-    if "data" in st.session_state:
-        data = st.session_state["data"].copy()
-        column = st.selectbox("Select Column", data.select_dtypes(
-            include=[np.number]).columns)
-        action = st.selectbox(
-            "Select Action",
-            ["Remove Outliers (IQR)", "Set Bounds Manually",
-             "Replace Outliers"]
-        )
-        st.write("### Before:")
-        st.dataframe(data)
-        after_placeholder = st.empty()
-        modified_data = data.copy()
-        if action == "Remove Outliers (IQR)":
-            Q1 = data[column].quantile(0.25)
-            Q3 = data[column].quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-            # Remove outliers
-            modified_data = modified_data[(
-                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
-        elif action == "Set Bounds Manually":
-            # User inputs for bounds
-            lower_bound = st.number_input(
-                f"Set lower bound for {column}", value=float(data[column].min()))
-            upper_bound = st.number_input(
-                f"Set upper bound for {column}", value=float(data[column].max()))
-            modified_data = modified_data[(
-                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
-        elif action == "Replace Outliers":
-            Q1 = data[column].quantile(0.25)
-            Q3 = data[column].quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-            replace_method = st.radio(
-                "Select Replacement Method",
-                ["Mean", "Median"]
-            )
-            if replace_method == "Mean":
-                replacement_value = data[column].mean()
-            else:
-                replacement_value = data[column].median()
-            # Replace outliers
-            modified_data[column] = modified_data[column].apply(
-                lambda x: replacement_value if x < lower_bound or x > upper_bound else x
-            )
-        # After Visualization
-        st.write("### After (Preview):")
-        st.dataframe(modified_data)
-        if st.button("OK"):
-            st.session_state["data"] = modified_data
-            st.success("Done! The action has been applied.")
-            st.write("### After:")
-            st.dataframe(modified_data)
-    else:
-        st.warning("Please upload a dataset first.")

+import streamlit as st
+import pandas as pd
+import numpy as np
+import io
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import base64
+def handle_categorical_values():
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        st.subheader("Handle Categorical Values")
+        categorical_cols_features = list(data.select_dtypes(include="object").columns)
+        one_hot_enc = st.multiselect("Select nominal categorical columns", categorical_cols_features)
+        if one_hot_enc:
+            for column in one_hot_enc:
+                if data[column].dtype == 'object':
+                    data = pd.get_dummies(data, columns=[column])
+            st.session_state["data"] = data
+            st.write("### Data after One-Hot Encoding:")
+            st.write(data.head())
+        label_encoder = LabelEncoder()
+        label_enc = st.multiselect("Select ordinal categorical columns", categorical_cols_features)
+        if label_enc:
+            for column in label_enc:
+                if data[column].dtype == 'object':
+                    data[column] = label_encoder.fit_transform(data[column])
+            st.session_state["data"] = data
+            st.write("### Data after Label Encoding:")
+            st.write(data.head())
+    else:
+        st.warning("Please upload a dataset to handle categorical values.")
+def missing_values():
+    st.title("Handle Missing Values")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        action = st.selectbox(
+            "Select Action", ["Drop", "Dropna", "Fill missing val"])
+        column = st.selectbox("Select Column", data.columns)
+        st.write("### Before:")
+        st.dataframe(data)
+        modified_data = data.copy()
+        if action == "Drop":
+            modified_data.drop(columns=[column], inplace=True)
+        elif action == "Dropna":
+            modified_data.dropna(subset=[column], inplace=True)
+        elif action == "Fill missing val":
+            fill_method = st.selectbox(
+                "Select fill method", ["Mean", "Mode", "Median"])
+            if fill_method == "Mean":
+                fill_value = data[column].mean()
+            elif fill_method == "Mode":
+                fill_value = data[column].mode()[0]
+            elif fill_method == "Median":
+                fill_value = data[column].median()
+            modified_data[column].fillna(fill_value, inplace=True)
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")
+def handle_duplicates():
+    st.title("Handle Duplicates")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        action = st.selectbox(
+            "Select Action", ["Drop Duplicates", "Drop Duplicates in Column", "Keep First", "Keep Last"])
+        if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
+            column = st.selectbox("Select Column", data.columns)
+        else:
+            column = None
+        st.write("### Before:")
+        st.dataframe(data)
+        after_placeholder = st.empty()
+        modified_data = data.copy()
+        if action == "Drop Duplicates":
+            modified_data.drop_duplicates(inplace=True)
+        elif action == "Drop Duplicates in Column":
+            modified_data.drop_duplicates(subset=[column], inplace=True)
+        elif action == "Keep First":
+            modified_data.drop_duplicates(
+                subset=[column], keep="first", inplace=True)
+        elif action == "Keep Last":
+            modified_data.drop_duplicates(
+                subset=[column], keep="last", inplace=True)
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")
+def handle_outliers():
+    st.title("Handle Outliers")
+    if "data" in st.session_state:
+        data = st.session_state["data"].copy()
+        column = st.selectbox("Select Column", data.select_dtypes(
+            include=[np.number]).columns)
+        action = st.selectbox(
+            "Select Action",
+            ["Remove Outliers (IQR)", "Set Bounds Manually",
+             "Replace Outliers"]
+        )
+        st.write("### Before:")
+        st.dataframe(data)
+        after_placeholder = st.empty()
+        modified_data = data.copy()
+        if action == "Remove Outliers (IQR)":
+            Q1 = data[column].quantile(0.25)
+            Q3 = data[column].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            # Remove outliers
+            modified_data = modified_data[(
+                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+        elif action == "Set Bounds Manually":
+            # User inputs for bounds
+            lower_bound = st.number_input(
+                f"Set lower bound for {column}", value=float(data[column].min()))
+            upper_bound = st.number_input(
+                f"Set upper bound for {column}", value=float(data[column].max()))
+            modified_data = modified_data[(
+                modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
+        elif action == "Replace Outliers":
+            Q1 = data[column].quantile(0.25)
+            Q3 = data[column].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            replace_method = st.radio(
+                "Select Replacement Method",
+                ["Mean", "Median"]
+            )
+            if replace_method == "Mean":
+                replacement_value = data[column].mean()
+            else:
+                replacement_value = data[column].median()
+            # Replace outliers
+            modified_data[column] = modified_data[column].apply(
+                lambda x: replacement_value if x < lower_bound or x > upper_bound else x
+            )
+        # After Visualization
+        st.write("### After (Preview):")
+        st.dataframe(modified_data)
+        if st.button("OK"):
+            st.session_state["data"] = modified_data
+            st.success("Done! The action has been applied.")
+            st.write("### After:")
+            st.dataframe(modified_data)
+    else:
+        st.warning("Please upload a dataset first.")