# Page-status residue from the hosting listing ("Spaces: Runtime error");
# kept as a comment because it is not part of the program.
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import pycaret
import seaborn as sns
import streamlit as st
from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
from streamlit_option_menu import option_menu
# CSS injected once per run to hide Streamlit's default menu and footer.
_HIDE_STREAMLIT_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""

_LOGO_PATH = 'itaca_logo.png'
_DATA_DIRECTORY = "./"
_SESSION_ID = 123

_CLUSTERING_MODELS = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
_ANOMALY_MODELS = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']

# (plot name, models for which the plot is skipped), in display order.
_CLUSTER_PLOTS = (
    ('cluster', ()),
    ('tsne', ('ap',)),
    ('elbow', ('ap', 'meanshift', 'dbscan', 'optics')),
    ('silhouette', ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics')),
    ('distance', ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch')),
    ('distribution', ('ap',)),
)


def _render_sidebar():
    """Draw the sidebar and return ``(page, settings)``.

    ``page`` is the option-menu selection. ``settings`` is a dict with the
    keys ``num_lines``, ``graph_select``, ``feat_imp_select``,
    ``selected_clusters`` and ``preprocess`` (keyword arguments forwarded to
    the PyCaret ``setup`` call).
    """
    with st.sidebar:
        # A missing logo should not prevent the rest of the app from rendering.
        if os.path.exists(_LOGO_PATH):
            st.image(Image.open(_LOGO_PATH), width=150)
        page = option_menu(
            menu_title='Menu',
            menu_icon="robot",
            options=["Clustering Analysis", "Anomaly Detection"],
            icons=["chat-dots", "key"],
            default_index=0,
        )
        st.header("Settings")
        num_lines = st.number_input("% of lines to be processed:", min_value=0, max_value=100, value=100)
        graph_select = st.checkbox("Show Graphics", value=True)
        feat_imp_select = st.checkbox("Feature Importance", value=False)
        selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)
        remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        multicollinearity_threshold = st.slider(
            "Choose multicollinearity thresholds", min_value=0.0, max_value=1.0, value=0.9
        )
        transformation = st.checkbox("Choose Power Transform", value=False)
        normalize = st.checkbox("Choose Normalize", value=False)
        pca = st.checkbox("Choose PCA", value=False)
        pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])

    settings = {
        "num_lines": num_lines,
        "graph_select": graph_select,
        "feat_imp_select": feat_imp_select,
        "selected_clusters": selected_clusters,
        "preprocess": {
            "remove_multicollinearity": remove_multicollinearity,
            "multicollinearity_threshold": multicollinearity_threshold,
            "transformation": transformation,
            "normalize": normalize,
            "pca": pca,
            "pca_method": pca_method,
        },
    }
    return page, settings


def _read_claims(selected_csv, uploaded_file):
    """Read the uploaded file if there is one, otherwise the selected local CSV.

    An upload is first parsed as comma-separated; if pandas raises a
    ``ValueError`` (its parser and decoding errors are subclasses of it) the
    file is re-read as pipe-separated latin-1.
    """
    if uploaded_file is not None:
        try:
            return pd.read_csv(uploaded_file, sep=',')
        except ValueError:
            # The failed attempt has already consumed part of the buffer;
            # rewind so the retry parses the file from its first byte.
            uploaded_file.seek(0)
            return pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
    return pd.read_csv(os.path.join(_DATA_DIRECTORY, selected_csv))


def _load_dataset(num_lines, model_label, model_names):
    """Render the data-source and model pickers and load the working data.

    Returns ``(data, selected_model)``. ``data`` is ``None`` when no source
    has been chosen yet or when the row/column selection leaves nothing to
    process.
    """
    csv_files = [name for name in os.listdir(_DATA_DIRECTORY) if name.endswith(".csv")]
    selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    selected_model = st.selectbox(model_label, model_names)

    if selected_csv == "None" and uploaded_file is None:
        return None, selected_model

    insurance_claims = _read_claims(selected_csv, uploaded_file)
    # Keep only the leading percentage of rows requested in the sidebar.
    num_rows = int(insurance_claims.shape[0] * num_lines / 100)
    reduced = insurance_claims.head(num_rows)
    st.write("Rows to be processed: " + str(num_rows))

    all_columns = reduced.columns.tolist()
    selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
    reduced = reduced[selected_columns].copy()

    # describe() and the PyCaret setup both fail on a frame without rows or
    # columns, so stop here with a readable message instead.
    if reduced.empty:
        st.warning("No data to process: select at least one row and one column.")
        return None, selected_model
    return reduced, selected_model


def _show_overview(data):
    """Show descriptive statistics and a correlation heatmap for ``data``."""
    with st.expander("Inference Description", expanded=True):
        st.write(data.describe().T)
    with st.expander("Head Map", expanded=True):
        num_col = data.select_dtypes(exclude=['object']).columns
        corr_matrix = data[num_col].corr()
        if corr_matrix.empty:
            st.info("No numeric columns available for a correlation heatmap.")
            return
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title('Correlation Heatmap')
        st.pyplot(fig)
        # pyplot keeps every figure it creates alive until closed; without
        # this each Streamlit rerun would add another one.
        plt.close(fig)


def _feature_names(get_config, n_features):
    """Return column names whose count matches the model's coefficients.

    NOTE(review): assumes PyCaret exposes the encoded training frame as
    ``X_train_transformed`` (3.x) and falls back to ``X_train`` - confirm
    against the installed version. Generic names are used if neither matches.
    """
    for key in ("X_train_transformed", "X_train"):
        try:
            columns = list(get_config(key).columns)
        except (KeyError, ValueError, AttributeError):
            continue
        if len(columns) == n_features:
            return columns
    return [f"feature_{i}" for i in range(n_features)]


def _show_feature_importance(labelled, target, leak_columns=()):
    """Fit a logistic regression on ``target`` and chart the top 10 features.

    ``leak_columns`` are dropped before training because they are derived
    from the target and would otherwise dominate the ranking.
    """
    from pycaret.classification import create_model, get_config, setup

    training = labelled.drop(columns=list(leak_columns), errors="ignore")
    setup(training, target=target, session_id=_SESSION_ID)
    lr = create_model('lr')
    # coef_ has one row per class in the multiclass case; average the
    # magnitudes over all rows instead of reporting only the first class.
    # With a single row this equals abs(coef_[0]).
    importance = np.abs(lr.coef_).mean(axis=0)
    feat_imp = (
        pd.DataFrame({'Feature': _feature_names(get_config, len(importance)), 'Value': importance})
        .sort_values(by='Value', ascending=False)
        .head(10)
    )
    st.bar_chart(feat_imp.set_index('Feature'))


def _run_clustering(data, selected_model, settings):
    """Train the chosen clustering model on ``data`` and render the results."""
    from pycaret.clustering import assign_model, create_model, plot_model, pull, setup

    # One setup call only: the functional API below works on the experiment
    # this call creates, and it carries the sidebar preprocessing options.
    setup(data, session_id=_SESSION_ID, **settings["preprocess"])
    with st.spinner("Analyzing..."):
        cluster_model = create_model(selected_model, num_clusters=settings["selected_clusters"])
        labelled = assign_model(cluster_model)

        # Statistics such as mean/skew are undefined for text columns, so
        # the per-cluster summary is limited to numeric columns.
        numeric_cols = list(
            labelled.select_dtypes(include="number").columns.drop("Cluster", errors="ignore")
        )
        with st.expander("Cluster Summary", expanded=False):
            if numeric_cols:
                cluster_summary = labelled.groupby('Cluster')[numeric_cols].agg(
                    ['count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
                     ('quantile_25', lambda x: x.quantile(0.25)),
                     ('quantile_75', lambda x: x.quantile(0.75)),
                     'skew']
                )
                st.write(cluster_summary)
            else:
                st.info("No numeric columns available for a cluster summary.")
        with st.expander("Model Assign", expanded=False):
            st.write(labelled)
        with st.expander("Clustering Metrics", expanded=False):
            st.write(pull())
        with st.expander("Clustering Plots", expanded=False):
            if settings["graph_select"]:
                for plot_name, skipped_models in _CLUSTER_PLOTS:
                    if selected_model not in skipped_models:
                        plot_model(cluster_model, plot=plot_name, display_format='streamlit')
        with st.expander("Feature Importance", expanded=False):
            if settings["graph_select"] and settings["feat_imp_select"]:
                _show_feature_importance(labelled, target='Cluster')


def _run_anomaly_detection(data, selected_model, settings):
    """Train the chosen anomaly model on ``data`` and render the results."""
    from pycaret.anomaly import assign_model, create_model, plot_model, pull, setup

    # One setup call only: the functional API below works on the experiment
    # this call creates, and it carries the sidebar preprocessing options.
    setup(data, session_id=_SESSION_ID, **settings["preprocess"])
    with st.spinner("Analyzing..."):
        anomaly_model = create_model(selected_model)
        with st.expander("Assign Model", expanded=False):
            labelled = assign_model(anomaly_model)
            st.write(labelled)
        with st.expander("Anomaly Metrics", expanded=False):
            st.write(pull())
        with st.expander("Anomaly Plots", expanded=False):
            if settings["graph_select"]:
                plot_model(anomaly_model, plot='tsne', display_format='streamlit')
                plot_model(anomaly_model, plot='umap', display_format='streamlit')
        with st.expander("Feature Importance", expanded=False):
            if settings["graph_select"] and settings["feat_imp_select"]:
                # NOTE(review): assumes assign_model adds an 'Anomaly_Score'
                # column next to 'Anomaly'; it is excluded so the classifier
                # cannot read the label off the score - confirm column name.
                _show_feature_importance(labelled, target='Anomaly', leak_columns=('Anomaly_Score',))


def main():
    """Render the ITACA Streamlit app.

    The sidebar selects the page (clustering or anomaly detection) and the
    preprocessing options. Each page lets the user pick or upload a CSV,
    shows summary statistics and a correlation heatmap, and runs the selected
    PyCaret model when the "Prediction" button is pressed.
    """
    st.set_page_config(layout="wide")
    st.markdown(_HIDE_STREAMLIT_STYLE, unsafe_allow_html=True)

    page, settings = _render_sidebar()
    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        st.header('Clustering Analysis')
        st.write("\n")
        model_label, model_names, run = "Choose a clustering model", _CLUSTERING_MODELS, _run_clustering
    elif page == "Anomaly Detection":
        st.header('Anomaly Detection')
        st.write("\n")
        model_label, model_names, run = "Choose an anomaly model", _ANOMALY_MODELS, _run_anomaly_detection
    else:
        return

    data, selected_model = _load_dataset(settings["num_lines"], model_label, model_names)
    if data is None:
        return
    _show_overview(data)
    if st.button("Prediction"):
        run(data, selected_model, settings)
# Run the app only when this file is executed as the script (Streamlit runs
# the target file as "__main__"); importing the module has no side effects.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: surface any failure in the sidebar instead of
        # letting the page render a raw traceback.
        st.sidebar.error(f"An error occurred: {e}")