"""Streamlit app that labels repayment speed with a pre-trained k-means model.

The user uploads a CSV or XLSX file; the app shows the uploaded table, derives
the model features with :func:`preprocess_data`, and displays one prediction
per row that has an ID.
"""

import pickle

import numpy as np
import pandas as pd

# Path of the pickled k-means model, relative to the working directory.
MODEL_PATH = "kmeans_model.pkl"

ID_COLUMN = "ID"
SPEED_COLUMN = "Repayment Speed"

# Columns that are discarded before modelling ("Zone" was noted as all-NaN).
IGNORED_COLUMNS = [
    "Zone",
    "At Risk Rate",
    "Disabled Rate",
    "Total Amount Post Upfront Amount",
    "Gender",
]
CATEGORICAL_COLUMNS = ["County", "Area", "Accounts Product Family"]
NUMERIC_COLUMNS = ["Age", "Repayment Speed", "Total Amount Paid"]

# Cluster index -> display label.
# NOTE(review): presumably matches the cluster order of the pickled model;
# re-check this mapping whenever the model is retrained.
CLUSTER_LABELS = {0: "slower", 1: "faster", 2: "medium"}

XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def load_model(path=MODEL_PATH):
    """Return the object unpickled from *path* (the k-means model)."""
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


def read_uploaded_file(uploaded_file):
    """Read an uploaded CSV or XLSX file into a DataFrame.

    The format is chosen from the file name's extension, falling back to the
    reported MIME type. The MIME type alone is not reliable: some systems
    report CSV uploads with a spreadsheet MIME type, which previously left
    the data unread.

    Args:
        uploaded_file: A binary file-like object with ``name`` and ``type``
            attributes, as returned by ``st.file_uploader``.

    Raises:
        ValueError: If the file is neither CSV nor XLSX.
    """
    name = str(getattr(uploaded_file, "name", "")).lower()
    mime = getattr(uploaded_file, "type", "")
    if name.endswith(".csv") or mime == "text/csv":
        return pd.read_csv(uploaded_file)
    if name.endswith(".xlsx") or mime == XLSX_MIME:
        return pd.read_excel(uploaded_file)
    raise ValueError(f"unsupported file {name!r} (expected .csv or .xlsx)")


def stringify_ids(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* whose non-missing IDs are strings.

    Missing IDs stay missing. A plain ``astype(str)`` would turn them into
    the literal string "nan", after which ``dropna`` can no longer find them.

    Raises:
        KeyError: If *data* has no ID column.
    """
    result = data.copy()
    ids = result[ID_COLUMN]
    result[ID_COLUMN] = ids.astype(str).where(ids.notna())
    return result


def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Turn the uploaded table into the model's feature table.

    Steps: discard the ignored columns, drop rows without an ID, fill the
    remaining gaps with each column's mode, drop the ID column, replace the
    categorical columns by integer category codes, and apply ``log1p`` to the
    numeric columns. The input frame is not modified and the returned frame
    keeps the index of the surviving rows.

    Args:
        data: Uploaded table; must contain the ID, categorical and numeric
            columns. Ignored columns may be absent.

    Returns:
        The feature table (empty if no row has an ID).

    Raises:
        KeyError: If a required column is missing.
    """
    required = [ID_COLUMN, *CATEGORICAL_COLUMNS, *NUMERIC_COLUMNS]
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"missing required column(s): {', '.join(missing)}")

    data = data.drop(columns=IGNORED_COLUMNS, errors="ignore")
    data = data.dropna(subset=[ID_COLUMN])
    # mode() of an empty frame has no rows, so there is nothing to fill with.
    if not data.empty:
        data = data.fillna(data.mode().iloc[0])

    features = data.drop(columns=[ID_COLUMN])
    for col in CATEGORICAL_COLUMNS:
        features[col] = pd.Categorical(features[col]).codes
    features[NUMERIC_COLUMNS] = np.log1p(features[NUMERIC_COLUMNS])
    return features


def main():
    """Render the app: upload, show the data, predict, show the predictions."""
    # Imported here so the data helpers above can be imported (for tests or
    # batch use) without the UI dependency being installed.
    import streamlit as st

    kmeans = load_model()

    st.title("Anomaly Detection on Repayment Speed")
    st.sidebar.header("Upload your data")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a file", type=["csv", "xlsx"]
    )
    if uploaded_file is None:
        return

    try:
        data = stringify_ids(read_uploaded_file(uploaded_file))
    except (KeyError, ValueError) as exc:
        st.error(f"Could not read the uploaded file: {exc}")
        return

    st.subheader("Your data")
    st.write(data)

    try:
        features = preprocess_data(data)
    except KeyError as exc:
        st.error(f"Could not prepare the uploaded data: {exc}")
        return
    if features.empty:
        st.warning("No rows with an ID were found, so there is nothing to predict.")
        return

    # The model is applied to the (log-transformed) repayment speed only.
    clusters = kmeans.predict(features[SPEED_COLUMN].to_frame())

    # features keeps the index of the rows that survived preprocessing, so
    # selecting by it lines each prediction up with its ID.
    results = data.loc[features.index, [ID_COLUMN]].copy()
    results["Prediction"] = clusters
    results["Prediction"] = results["Prediction"].replace(CLUSTER_LABELS)

    st.subheader("prediction")
    st.write(results)


# `streamlit run` executes the script as `__main__`, so the guard still runs
# the app while keeping plain imports of this module free of side effects.
if __name__ == "__main__":
    main()