"""Generate plain-text narratives and summary charts for a pandas DataFrame."""

import itertools

import numpy as np
import pandas as pd

# The plotting stack is needed only by ``generate_word_cloud`` and
# ``generate_summary_dashboard``. Importing it defensively keeps the text-only
# story methods usable where these packages are not installed; the chart
# methods raise ImportError at call time instead.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
try:
    import seaborn as sns
except ImportError:
    sns = None
try:
    from wordcloud import WordCloud
except ImportError:
    WordCloud = None


def _require(dependency, package_name):
    """Return *dependency*, or raise ImportError if its import failed."""
    if dependency is None:
        raise ImportError(
            f"{package_name} is required for this method but is not installed"
        )
    return dependency


class DataStoryteller:
    """Describe a DataFrame's numeric columns in text and in standard charts."""

    # |r| above which a pair of columns is reported as strongly correlated.
    CORRELATION_THRESHOLD = 0.5
    # |skewness| below which a column is reported as roughly symmetric.
    SKEW_THRESHOLD = 0.5
    # A fitted change smaller than this fraction of the column's largest
    # magnitude is treated as floating-point noise rather than a trend.
    TREND_RELATIVE_TOLERANCE = 1e-12

    def __init__(self):
        """Create a storyteller; instances hold no state."""

    @staticmethod
    def _numeric_frame(data):
        """Return only the numeric columns of *data*."""
        return data.select_dtypes(include=[np.number])

    def generate_story(self, data):
        """Build the full text report for *data*.

        Args:
            data: DataFrame to describe. Non-numeric columns are ignored.

        Returns:
            A string starting with ``"Data Story:"`` followed by the basic
            statistics, correlation, trend and distribution sections, in
            that order.
        """
        sections = (
            self._generate_basic_stats(data),
            self._generate_correlation_analysis(data),
            self._generate_trend_analysis(data),
            self._generate_distribution_analysis(data),
        )
        return "Data Story:\n\n" + "".join(sections)

    def _generate_basic_stats(self, data):
        """Return mean, median, min and max for every numeric column.

        Columns are selected with the same numeric filter as the other
        sections, so a frame without numeric columns yields just the header.
        """
        parts = ["Basic Statistics:\n"]
        for name, values in self._numeric_frame(data).items():
            parts.append(
                f"\n{name}:\n"
                f" Mean: {float(values.mean()):.2f}\n"
                f" Median: {float(values.median()):.2f}\n"
                f" Min: {float(values.min()):.2f}\n"
                f" Max: {float(values.max()):.2f}\n"
            )
        return "".join(parts)

    def _generate_correlation_analysis(self, data):
        """Return one line per column pair whose |correlation| is strong.

        A pair is listed when the absolute value of ``DataFrame.corr()`` for
        it exceeds ``CORRELATION_THRESHOLD``. Undefined (NaN) correlations
        are never listed.
        """
        header = "\nCorrelation Analysis:\n"
        numeric = self._numeric_frame(data)
        if numeric.shape[1] < 2:
            # No pairs to compare.
            return header

        corr_matrix = numeric.corr()
        parts = [header]
        indexed_names = enumerate(corr_matrix.columns)
        for (i, left), (j, right) in itertools.combinations(indexed_names, 2):
            # Positional lookup stays scalar even with duplicate labels.
            corr = corr_matrix.iat[i, j]
            if abs(corr) > self.CORRELATION_THRESHOLD:
                parts.append(
                    f" Strong correlation between {left} and {right}: {corr:.2f}\n"
                )
        return "".join(parts)

    def _generate_trend_analysis(self, data):
        """Return one line per numeric column describing its linear trend.

        A degree-1 polynomial is fitted to each column against row position.
        Non-finite values are left out of the fit, and a column with fewer
        than two usable values is reported as having insufficient data.
        """
        parts = ["\nTrend Analysis:\n"]
        positions = np.arange(len(data), dtype=float)
        for name, values in self._numeric_frame(data).items():
            y = values.to_numpy(dtype=float, na_value=np.nan)
            usable = np.isfinite(y)
            if usable.sum() < 2:
                parts.append(f" {name} has insufficient data to assess a trend.\n")
                continue

            x, y = positions[usable], y[usable]
            slope = np.polyfit(x, y, 1)[0]
            # Compare the change over the fitted span with the data's own
            # magnitude, so rounding error on flat data is not called a trend.
            fitted_change = slope * (x[-1] - x[0])
            noise_floor = self.TREND_RELATIVE_TOLERANCE * np.abs(y).max()
            if fitted_change > noise_floor:
                parts.append(f" {name} shows an increasing trend.\n")
            elif fitted_change < -noise_floor:
                parts.append(f" {name} shows a decreasing trend.\n")
            else:
                parts.append(f" {name} shows no significant trend.\n")
        return "".join(parts)

    def _generate_distribution_analysis(self, data):
        """Return one line per numeric column describing its skewness.

        Uses ``Series.skew()``. An undefined (NaN) skewness is reported as
        insufficient data rather than falling through to a skew direction.
        """
        parts = ["\nDistribution Analysis:\n"]
        for name, values in self._numeric_frame(data).items():
            skewness = values.skew()
            if pd.isna(skewness):
                parts.append(f" {name} has insufficient data to assess skewness.\n")
            elif abs(skewness) < self.SKEW_THRESHOLD:
                parts.append(
                    f" {name} is approximately symmetrically distributed.\n"
                )
            elif skewness > 0:
                parts.append(f" {name} is right-skewed.\n")
            else:
                parts.append(f" {name} is left-skewed.\n")
        return "".join(parts)

    def generate_word_cloud(self, data, text_column):
        """Draw a word cloud from one column of *data*.

        Args:
            data: DataFrame holding the text.
            text_column: Label of the column whose values are joined with
                spaces to form the input text.

        Returns:
            The ``matplotlib.pyplot`` module, with the word cloud drawn on a
            newly created figure.

        Raises:
            ImportError: If matplotlib or wordcloud is not installed.
        """
        pyplot = _require(plt, "matplotlib")
        cloud_cls = _require(WordCloud, "wordcloud")

        # Drop missing values first; astype(str) would otherwise turn each
        # one into the literal word "nan".
        text = " ".join(data[text_column].dropna().astype(str))
        wordcloud = cloud_cls(
            width=800, height=400, background_color='white'
        ).generate(text)

        pyplot.figure(figsize=(10, 5))
        pyplot.imshow(wordcloud, interpolation='bilinear')
        pyplot.axis('off')
        pyplot.title('Word Cloud')
        return pyplot

    def generate_summary_dashboard(self, data):
        """Draw a 2x2 overview figure of the numeric columns of *data*.

        Panels: histogram and box plot of the first numeric column, scatter
        plot of the first two numeric columns, and a correlation heatmap of
        all numeric columns. With a single numeric column the scatter panel
        shows a placeholder message instead.

        Returns:
            The created matplotlib figure.

        Raises:
            ValueError: If *data* has no numeric columns.
            ImportError: If matplotlib or seaborn is not installed.
        """
        numeric = self._numeric_frame(data)
        numeric_columns = list(numeric.columns)
        if not numeric_columns:
            raise ValueError("data has no numeric columns to plot")

        pyplot = _require(plt, "matplotlib")
        seaborn = _require(sns, "seaborn")
        first = numeric_columns[0]

        fig, axs = pyplot.subplots(2, 2, figsize=(20, 15))

        # Histogram
        seaborn.histplot(data=data, x=first, ax=axs[0, 0])
        # f-string rather than "+" so non-string column labels also work.
        axs[0, 0].set_title(f'Distribution of {first}')

        # Scatter plot
        if len(numeric_columns) > 1:
            seaborn.scatterplot(
                data=data, x=first, y=numeric_columns[1], ax=axs[0, 1]
            )
        else:
            axs[0, 1].text(
                0.5,
                0.5,
                'Scatter plot needs two numeric columns',
                ha='center',
                va='center',
                transform=axs[0, 1].transAxes,
            )
            axs[0, 1].set_axis_off()
        axs[0, 1].set_title('Scatter Plot')

        # Box plot
        seaborn.boxplot(data=data, y=first, ax=axs[1, 0])
        axs[1, 0].set_title('Box Plot')

        # Correlation heatmap
        seaborn.heatmap(numeric.corr(), annot=True, cmap='coolwarm', ax=axs[1, 1])
        axs[1, 1].set_title('Correlation Heatmap')

        fig.tight_layout()
        return fig