Spaces:
Runtime error
Runtime error
| import nltk | |
| import seaborn as sns | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import scipy.stats as stats | |
| from sklearn.decomposition import PCA | |
| from wordcloud import WordCloud | |
| from sklearn.metrics import confusion_matrix | |
| from nltk import regexp_tokenize | |
| # Single attribute visualization | |
def distribution_histogram(df, attribute):
    """
    Histogram of the distribution of a single attribute.

    Object, string and categorical columns are factorized to integer codes and
    drawn as a discrete histogram whose ticks carry the original labels. Every
    other column is handed to seaborn unchanged.

    :param df: DataFrame holding the data.
    :param attribute: Name of the column to plot.
    :return: The matplotlib figure containing the histogram.
    """
    column = df[attribute]
    # `pd.api.types.is_categorical_dtype` is deprecated, so the dtype is
    # inspected directly. The string check also covers pandas' dedicated
    # string dtype, which does not compare equal to 'object'.
    is_label_like = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )

    if is_label_like:
        codes, uniques = pd.factorize(column)
        # factorize marks missing values with -1; drop them so they do not
        # show up as an unlabeled bar left of the first category.
        codes = codes[codes != -1]
        temp_df = pd.DataFrame({attribute: codes})
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(temp_df[attribute], ax=ax, discrete=True, color='#e17160')
        ax.set_xticks(range(len(uniques)))
        ax.set_xticklabels(uniques, rotation=45, ha='right')
    else:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(column, ax=ax, color='#e17160')

    ax.set_title(f"Distribution of {attribute}")
    return fig
def distribution_boxplot(df, attribute):
    """
    Boxplot of the distribution of a single attribute.

    The plot is drawn with seaborn's ``boxenplot``.

    :param df: DataFrame holding the data.
    :param attribute: Name of the column to plot.
    :return: The matplotlib figure, or -1 when the column is of object,
        string or categorical dtype.
    """
    column = df[attribute]
    # `pd.api.types.is_categorical_dtype` is deprecated, so the dtype is
    # inspected directly. The string check also covers pandas' dedicated
    # string dtype, which does not compare equal to 'object'.
    if (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    ):
        return -1

    fig, ax = plt.subplots(figsize=(8, 6))
    # Draw on the axes created above instead of relying on pyplot's
    # "current axes" global state.
    sns.boxenplot(data=column, palette=["#32936f", "#26a96c", "#2bc016"], ax=ax)
    ax.set_title(f"Boxplot of {attribute}")
    return fig
def count_Y(df, Y_name):
    """
    Donut chart showing how often each value of one attribute occurs.

    :param df: DataFrame holding the data.
    :param Y_name: Name of the column whose values are counted.
    :return: The Plotly pie figure (drawn with a hole), or None when Y_name
        is not a column of df or the column has no distinct values.
    """
    # Guard clause: nothing to draw for an unknown or value-less column.
    if Y_name not in df.columns or df[Y_name].nunique() < 1:
        return None

    counts = df[Y_name].value_counts()
    return px.pie(
        names=counts.index,
        values=counts.values,
        title=f'Distribution of {Y_name}',
        hole=0.5,
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def density_plot(df, column_name):
    """
    Density contour of a single attribute, drawn with Plotly.

    The same column is bound to both the x and the y axis of the contour.

    :param df: DataFrame holding the data.
    :param column_name: Name of the column to plot.
    :return: The Plotly figure, or None when column_name is not a column of df.
    """
    if column_name not in df.columns:
        return None

    contour_options = dict(
        x=column_name,
        y=column_name,
        title=f'Density Plot of {column_name}',
        color_discrete_sequence=px.colors.sequential.Inferno,
    )
    return px.density_contour(df, **contour_options)
| # Multiple attribute visualization | |
def box_plot(df, column_names):
    """
    Box plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot. Names that are not
        columns of df are ignored.
    :return: The Plotly figure; -1 when more than one column is selected and
        at least one of them is not numeric; None when none of the names is a
        column of df.
    """
    def _is_number(dtype):
        # np.issubdtype raises TypeError for pandas extension dtypes
        # (categorical, string, nullable ints), so those go through pandas.
        if isinstance(dtype, np.dtype):
            return np.issubdtype(dtype, np.number)
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    # Filter first: indexing df with an unknown name would raise KeyError
    # before the filter had a chance to drop it.
    valid_columns = [col for col in column_names if col in df.columns]
    if len(valid_columns) > 1 and not all(_is_number(dt) for dt in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None

    return px.box(
        df,
        y=valid_columns,
        title=f'Box Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def violin_plot(df, column_names):
    """
    Violin plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot. Names that are not
        columns of df are ignored.
    :return: The Plotly figure; -1 when more than one column is selected and
        at least one of them is not numeric; None when none of the names is a
        column of df.
    """
    def _is_number(dtype):
        # np.issubdtype raises TypeError for pandas extension dtypes
        # (categorical, string, nullable ints), so those go through pandas.
        if isinstance(dtype, np.dtype):
            return np.issubdtype(dtype, np.number)
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    # Filter first: indexing df with an unknown name would raise KeyError
    # before the filter had a chance to drop it.
    valid_columns = [col for col in column_names if col in df.columns]
    if len(valid_columns) > 1 and not all(_is_number(dt) for dt in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None

    return px.violin(
        df,
        y=valid_columns,
        title=f'Violin Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def strip_plot(df, column_names):
    """
    Strip plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot. Names that are not
        columns of df are ignored.
    :return: The Plotly figure; -1 when more than one column is selected and
        at least one of them is not numeric; None when none of the names is a
        column of df.
    """
    def _is_number(dtype):
        # np.issubdtype raises TypeError for pandas extension dtypes
        # (categorical, string, nullable ints), so those go through pandas.
        if isinstance(dtype, np.dtype):
            return np.issubdtype(dtype, np.number)
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    # Filter first: indexing df with an unknown name would raise KeyError
    # before the filter had a chance to drop it.
    valid_columns = [col for col in column_names if col in df.columns]
    if len(valid_columns) > 1 and not all(_is_number(dt) for dt in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None

    return px.strip(
        df,
        y=valid_columns,
        title=f'Strip Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def multi_plot_scatter(df, selected_attributes):
    """
    Scatter plot of multiple attributes.

    Only the first two selected attributes are plotted: the first on the x
    axis, the second on the y axis. A column that is not a NumPy integer or
    float column is factorized and its original values are used as tick
    labels. Points are coloured by their row position.

    :param df: DataFrame holding the data.
    :param selected_attributes: Sequence of column names; at least two.
    :return: The matplotlib figure, or -1 when fewer than two attributes are
        given.
    """
    if len(selected_attributes) < 2:
        return -1

    def _is_plain_number(series):
        # Accept NumPy ints/floats of any width; comparing against
        # [np.float64, np.int64] would factorize int32/float32 columns.
        dtype = series.dtype
        return isinstance(dtype, np.dtype) and dtype.kind in "iuf"

    x_name, y_name = selected_attributes[0], selected_attributes[1]
    fig, ax = plt.subplots(figsize=(10, 6))

    if _is_plain_number(df[x_name]):
        x = df[x_name]
    else:
        x, x_labels = pd.factorize(df[x_name])
        ax.set_xticks(np.arange(len(x_labels)))
        ax.set_xticklabels(x_labels, rotation=45)

    if _is_plain_number(df[y_name]):
        y = df[y_name]
    else:
        y, y_labels = pd.factorize(df[y_name])
        ax.set_yticks(np.arange(len(y_labels)))
        ax.set_yticklabels(y_labels)

    points = ax.scatter(x, y, c=np.linspace(0, 1, len(df)), cmap='viridis')
    fig.colorbar(points, ax=ax)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f'Scatter Plot of {x_name} vs {y_name}')
    return fig
def multi_plot_line(df, selected_attributes):
    """
    Line plot of multiple attributes.

    Every selected attribute is drawn as its own line against the DataFrame
    index.

    :param df: DataFrame holding the data.
    :param selected_attributes: List of column names; at least two, all numeric.
    :return: The matplotlib figure; -1 when any selected column is not
        numeric; -2 when fewer than two attributes are given.
    """
    def _is_number(dtype):
        # np.issubdtype raises TypeError for pandas extension dtypes
        # (categorical, string, nullable ints), so those go through pandas.
        if isinstance(dtype, np.dtype):
            return np.issubdtype(dtype, np.number)
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not all(_is_number(dt) for dt in df[selected_attributes].dtypes):
        return -1
    if len(selected_attributes) < 2:
        return -2

    fig, ax = plt.subplots(figsize=(10, 6))
    colors = plt.cm.viridis(np.linspace(0, 1, len(selected_attributes)))
    for color, attribute in zip(colors, selected_attributes):
        ax.plot(df.index, df[attribute], marker='', linewidth=2, color=color, label=attribute)
    ax.legend()
    # The x axis is the index and the y axis carries every selected series,
    # so label them as such rather than with the first two attribute names.
    ax.set_xlabel(df.index.name or 'Index')
    ax.set_ylabel('Value')
    ax.set_title(f'Line Plot of {", ".join(str(name) for name in selected_attributes)}')
    return fig
def multi_plot_heatmap(df, selected_attributes):
    """
    Correlation heatmap of multiple attributes.

    Calls ``sns.set_theme()`` before drawing, which changes the global
    seaborn/matplotlib theme.

    :param df: DataFrame holding the data.
    :param selected_attributes: List of numeric column names.
    :return: The matplotlib figure; -1 when any selected column is not
        numeric; None when no attribute is selected.
    """
    def _is_number(dtype):
        # np.issubdtype raises TypeError for pandas extension dtypes
        # (categorical, string, nullable ints), so those go through pandas.
        if isinstance(dtype, np.dtype):
            return np.issubdtype(dtype, np.number)
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not all(_is_number(dt) for dt in df[selected_attributes].dtypes):
        return -1
    if len(selected_attributes) < 1:
        return None

    sns.set_theme()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(df[selected_attributes].corr(), annot=True, cmap='viridis', ax=ax)
    ax.set_title('Heatmap of Correlation')
    return fig
| # Overall visualization | |
def correlation_matrix(df):
    """
    Correlation heatmap of all attributes using Seaborn.

    Only numeric columns take part in the correlation. Calls ``sns.set``,
    which changes the global seaborn theme and font scale.

    :param df: DataFrame holding the data.
    :return: The matplotlib figure containing the annotated heatmap.
    """
    plt.figure(figsize=(16, 12))
    sns.set(font_scale=0.9)
    # numeric_only=True: since pandas 2.0, DataFrame.corr() no longer drops
    # non-numeric columns silently and raises on them instead.
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='viridis', annot_kws={"size": 12})
    return plt.gcf()
def correlation_matrix_plotly(df):
    """
    Correlation heatmap of all attributes using Plotly.

    Only numeric columns take part in the correlation. Each cell's hover text
    is the coefficient rounded to two decimals.

    :param df: DataFrame holding the data.
    :return: The Plotly figure containing the heatmap.
    """
    # numeric_only=True: since pandas 2.0, DataFrame.corr() no longer drops
    # non-numeric columns silently and raises on them instead.
    corr_matrix = df.corr(numeric_only=True)
    labels = corr_matrix.columns
    text = [[f'{value:.2f}' for value in row] for row in corr_matrix.values]

    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=labels,
        y=labels,
        colorscale='Viridis',
        colorbar=dict(title='Correlation'),
        text=text,
        hoverinfo='text',
    ))
    fig.update_layout(
        title='Correlation Matrix Between Attributes',
        xaxis=dict(tickmode='linear'),
        yaxis=dict(tickmode='linear'),
        width=800,
        height=700,
    )
    fig.update_layout(font=dict(size=10))
    return fig
def list_all(df, max_plots=16):
    """
    Display histograms of all attributes in the DataFrame.

    Histograms are laid out on a grid with at most four columns. Calls
    ``plt.style.use('ggplot')`` and ``sns.set(style="darkgrid")``, which
    change global plotting style.

    :param df: DataFrame holding the data.
    :param max_plots: Maximum number of columns to plot (the first ones win).
    :return: The matplotlib figure containing the grid of histograms.
    """
    # Number of plots to display (up to max_plots, never negative).
    num_plots = max(0, min(len(df.columns), max_plots))
    # Keep the grid at least 1x1 so an empty DataFrame still yields a figure.
    nrows = max(1, int(np.ceil(num_plots / 4)))
    ncols = max(1, min(num_plots, 4))

    # squeeze=False always returns a 2-D array of axes, so the single-plot
    # case needs no special handling before flattening.
    fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False)
    fig.suptitle('Attribute Distributions', fontsize=20)
    plt.style.use('ggplot')
    sns.set(style="darkgrid")

    axes = axes.flatten()

    # Draw the histograms.
    for ax, column in zip(axes, df.columns[:num_plots]):
        sns.histplot(ax=ax, data=df, x=column, color='#1867ac')

    # Hide the unused cells of the grid.
    for ax in axes[num_plots:]:
        ax.axis('off')

    plt.tight_layout()
    plt.subplots_adjust(top=0.95)  # Leave room for the figure title
    return fig
| # Model evaluation | |
def confusion_metrix(model_name, model, X_test, Y_test):
    """
    Confusion matrix heatmap for a classification model.

    :param model_name: Name shown in the plot title.
    :param model: Object exposing ``predict(X_test)``.
    :param X_test: Features passed to ``model.predict``.
    :param Y_test: True labels compared against the predictions.
    :return: The matplotlib figure that holds the heatmap.
    """
    predictions = model.predict(X_test)
    counts = confusion_matrix(Y_test, predictions)

    plt.figure(figsize=(10, 7))
    annotation_style = {"size": 20}
    heatmap_axes = sns.heatmap(
        counts,
        annot=True,
        cmap='Blues',
        fmt='g',
        annot_kws=annotation_style,
    )
    plt.title(f"Confusion Matrix for {model_name}", fontsize=20)
    plt.xlabel('Predicted labels', fontsize=16)
    plt.ylabel('True labels', fontsize=16)
    return heatmap_axes.figure
def roc(model_name, fpr, tpr):
    """
    ROC curve for a classification model.

    Draws the dashed diagonal from (0, 0) to (1, 1) and the model's curve on
    top. Switches the global matplotlib style to 'ggplot'.

    :param model_name: Name used for the legend entry and the title.
    :param fpr: False positive rates (x values).
    :param tpr: True positive rates (y values).
    :return: The matplotlib figure containing the curve.
    """
    figure = plt.figure()
    plt.style.use('ggplot')

    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')
    plt.plot(fpr, tpr, label=model_name)

    plt.title(f'ROC Curve - {model_name}')
    plt.xlabel('False Positive rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.xticks(rotation=45)
    return figure
def plot_clusters(X, labels):
    """
    Scatter plot of clusters for clustering models.

    The data is projected onto its first two principal components and each
    cluster is drawn in its own colour. Calls ``sns.set``, which changes the
    global seaborn style.

    :param X: Feature matrix passed to ``PCA.fit_transform``.
    :param labels: Cluster label of each row of X (any array-like).
    :return: The matplotlib figure containing the scatter plot.
    """
    sns.set(style="whitegrid")
    X_pca = PCA(n_components=2).fit_transform(X)

    # Coerce to an array so `labels == label` is an element-wise mask even
    # when a plain list or a pandas Series is passed in.
    labels = np.asarray(labels)
    # np.unique returns sorted labels, giving a stable colour/legend order.
    unique_labels = np.unique(labels)
    colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))

    fig, ax = plt.subplots()
    for color, label in zip(colors, unique_labels):
        idx = labels == label
        ax.scatter(X_pca[idx, 0], X_pca[idx, 1], color=color, label=f'Cluster {label}', s=50)
    ax.set_title('Cluster Scatter Plot')
    ax.legend()
    return fig
def plot_residuals(y_pred, Y_test):
    """
    Residual plot for regression models.

    Residuals are computed as ``Y_test - y_pred`` and handed to seaborn's
    ``residplot`` against the predictions, with a lowess trend line.

    :param y_pred: Predicted values.
    :param Y_test: Actual values.
    :return: The matplotlib figure containing the plot.
    """
    residuals = Y_test - y_pred
    figure, axes = plt.subplots()

    point_style = {'alpha': 0.7}
    trend_style = {'color': 'purple', 'lw': 2}
    sns.residplot(
        x=y_pred,
        y=residuals,
        lowess=True,
        ax=axes,
        scatter_kws=point_style,
        line_kws=trend_style,
    )

    axes.set(xlabel='Predicted Values', ylabel='Residuals', title='Residual Plot')
    return figure
def plot_predictions_vs_actual(y_pred, Y_test):
    """
    Scatter plot of predicted vs. actual values for regression models.

    A dashed diagonal spanning the range of Y_test is drawn as the reference
    line.

    :param y_pred: Predicted values (y axis).
    :param Y_test: Actual values (x axis); must provide ``min()`` and ``max()``.
    :return: The matplotlib figure containing the plot.
    """
    figure, axes = plt.subplots()
    axes.scatter(Y_test, y_pred, c='#10a37f', marker='x')

    # Reference diagonal across the observed range of actual values.
    lowest, highest = Y_test.min(), Y_test.max()
    axes.plot([lowest, highest], [lowest, highest], 'k--', lw=2)

    axes.set_xlabel('Actual')
    axes.set_ylabel('Predicted')
    axes.set_title('Actual vs. Predicted')

    axes.set_facecolor('white')
    axes.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)
    return figure
def plot_qq_plot(y_pred, Y_test):
    """
    Quantile-Quantile plot for regression models.

    The residuals ``Y_test - y_pred`` are compared against a normal
    distribution via ``scipy.stats.probplot``. The fitted line and the
    ordered residuals are drawn on the same axes.

    :param y_pred: Predicted values.
    :param Y_test: Actual values.
    :return: The matplotlib figure containing the plot.
    """
    residuals = Y_test - y_pred
    figure, axes = plt.subplots()

    quantiles, fit = stats.probplot(residuals, dist="norm", plot=None)
    theoretical, ordered = quantiles
    slope, intercept, _ = fit

    axes.plot(theoretical, slope * theoretical + intercept, 'grey', lw=2)
    axes.scatter(theoretical, ordered, alpha=0.8, edgecolors='#e8b517', c='yellow', label='Data Points')

    axes.set_title('Quantile-Quantile Plot')
    axes.set_facecolor('white')
    axes.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)
    axes.set_xlabel('Theoretical Quantiles')
    axes.set_ylabel('Ordered Values')
    return figure
| # Advanced Visualization | |
def word_cloud_plot(text):
    """
    Generates a word cloud from the given text.

    The text is split into ``\\w+`` tokens, their frequencies are counted and
    the word cloud is rendered from those frequencies, so more frequent words
    are drawn larger.

    :param text: The input text from which to generate the word cloud.
    :return: A matplotlib figure object containing the word cloud if
        successful, -1 otherwise.
    """
    try:
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        words = regexp_tokenize(text, pattern=r'\w+')
        text_dist = nltk.FreqDist(words)
        wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(text_dist)
        fig, ax = plt.subplots(figsize=(10, 7.5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        return fig
    except Exception:
        # Best-effort contract: any failure is reported as -1. `Exception`
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return -1
def world_map(df, country_column, key_attribute):
    """
    Creates a choropleth world map visualization based on the specified DataFrame.

    Countries are located through a column named ``iso_alpha``, which df must
    contain (presumably ISO-3 country codes, Plotly's default location mode;
    confirm against the caller). ``country_column`` is used only as the hover
    title; every other column is added to the hover data.

    :param df: DataFrame containing the data to be visualized.
    :param country_column: Name of the column in df that contains country names.
    :param key_attribute: Name of the column in df that drives the colouring.
    :return: A Plotly figure object representing the choropleth map if
        successful, -1 otherwise.
    """
    try:
        hover_data_columns = [col for col in df.columns if col != country_column]
        return px.choropleth(
            df,
            locations="iso_alpha",
            color=key_attribute,
            hover_name=country_column,
            hover_data=hover_data_columns,
            color_continuous_scale=px.colors.sequential.Cividis,
            projection="equirectangular",
        )
    except Exception:
        # Best-effort contract: any failure is reported as -1. `Exception`
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return -1
def scatter_3d(df, x, y, z):
    """
    Generates a 3D scatter plot from the given DataFrame.

    Each point's position comes from the three named columns; points are
    coloured by the z column.

    :param df: DataFrame containing the data to be visualized.
    :param x: Name of the column in df to use for the x-axis values.
    :param y: Name of the column in df to use for the y-axis values.
    :param z: Name of the column in df to use for the z-axis values and color coding.
    :return: A Plotly figure object containing the 3D scatter plot if
        successful, -1 otherwise.
    """
    try:
        return px.scatter_3d(df, x=x, y=y, z=z, color=z, color_continuous_scale=px.colors.sequential.Viridis)
    except Exception:
        # Best-effort contract: any failure is reported as -1. `Exception`
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return -1