""" Exploratory Data Analysis (EDA) module for the awesome-chatgpt-prompts dataset. Generates visualizations for dataset exploration. """ import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import pandas as pd import numpy as np from datasets import load_dataset from collections import Counter import re # Set style for matplotlib plt.style.use('seaborn-v0_8-darkgrid') sns.set_palette("husl") # Dataset configuration DATASET_NAME = "fka/awesome-chatgpt-prompts" def load_prompts_dataset() -> pd.DataFrame: """ Load the awesome-chatgpt-prompts dataset from HuggingFace. Returns: DataFrame with the prompts data """ dataset = load_dataset(DATASET_NAME, split="train") df = dataset.to_pandas() # Add computed columns for analysis df['prompt_length'] = df['prompt'].apply(len) df['word_count'] = df['prompt'].apply(lambda x: len(x.split())) df['sentence_count'] = df['prompt'].apply(lambda x: len(re.split(r'[.!?]+', x))) return df def create_prompt_length_histogram() -> go.Figure: """ Create an interactive histogram of prompt lengths. Returns: Plotly Figure object """ df = load_prompts_dataset() fig = px.histogram( df, x='prompt_length', nbins=50, title='📏 Distribution of Prompt Lengths (Characters)', labels={'prompt_length': 'Prompt Length (characters)', 'count': 'Frequency'}, color_discrete_sequence=['#667eea'] ) fig.update_layout( template='plotly_dark', title_font_size=20, title_x=0.5, showlegend=False, xaxis_title="Character Count", yaxis_title="Number of Prompts" ) # Add mean line mean_length = df['prompt_length'].mean() fig.add_vline( x=mean_length, line_dash="dash", line_color="#ff6b6b", annotation_text=f"Mean: {mean_length:.0f}", annotation_position="top" ) return fig def create_word_count_boxplot() -> go.Figure: """ Create a boxplot showing word count distribution. Returns: Plotly Figure object """ df = load_prompts_dataset() fig = go.Figure() fig.add_trace(go.Box( y=df['word_count'], name='Word Count', marker_color='#764ba2', boxmean='sd' )) fig.update_layout( title='📊 Word Count Distribution', template='plotly_dark', title_font_size=20, title_x=0.5, yaxis_title="Words per Prompt", showlegend=False ) return fig def create_top_words_chart(top_n: int = 20) -> go.Figure: """ Create a bar chart of the most common words in prompts. Args: top_n: Number of top words to display Returns: Plotly Figure object """ df = load_prompts_dataset() # Tokenize and count words (excluding common stop words) stop_words = { 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'also', 'your', 'my', 'this', 'that', 'these', 'those', 'me', 'him', 'her', 'us', 'them', 'if', 'then', 'else', 'while', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'any', 'their' } all_words = [] for prompt in df['prompt']: words = re.findall(r'\b[a-zA-Z]+\b', prompt.lower()) all_words.extend([w for w in words if w not in stop_words and len(w) > 2]) word_counts = Counter(all_words).most_common(top_n) words, counts = zip(*word_counts) fig = go.Figure(go.Bar( x=list(counts)[::-1], y=list(words)[::-1], orientation='h', marker=dict( color=list(range(top_n)), colorscale='Viridis' ) )) fig.update_layout( title=f'🔤 Top {top_n} Most Common Words', template='plotly_dark', title_font_size=20, title_x=0.5, xaxis_title="Frequency", yaxis_title="Word", height=600 ) return fig def create_length_vs_words_scatter() -> go.Figure: """ Create a scatter plot showing relationship between length and word count. Returns: Plotly Figure object """ df = load_prompts_dataset() fig = px.scatter( df, x='word_count', y='prompt_length', title='📈 Prompt Length vs Word Count', labels={ 'word_count': 'Word Count', 'prompt_length': 'Character Length' }, color='sentence_count', color_continuous_scale='Plasma', hover_data=['act'] ) fig.update_layout( template='plotly_dark', title_font_size=20, title_x=0.5 ) return fig def create_summary_stats() -> pd.DataFrame: """ Create a summary statistics table for the dataset. Returns: DataFrame with summary statistics """ df = load_prompts_dataset() stats = { 'Metric': [ 'Total Prompts', 'Average Length (chars)', 'Average Word Count', 'Max Length (chars)', 'Min Length (chars)', 'Median Word Count' ], 'Value': [ len(df), f"{df['prompt_length'].mean():.0f}", f"{df['word_count'].mean():.1f}", df['prompt_length'].max(), df['prompt_length'].min(), f"{df['word_count'].median():.0f}" ] } return pd.DataFrame(stats) def create_category_distribution() -> go.Figure: """ Create a pie chart showing distribution of prompt categories (acts). Returns: Plotly Figure object """ df = load_prompts_dataset() # Get top 10 categories top_acts = df['act'].value_counts().head(10) fig = go.Figure(go.Pie( labels=top_acts.index, values=top_acts.values, hole=0.4, marker=dict(colors=px.colors.qualitative.Set3) )) fig.update_layout( title='🎭 Top 10 Prompt Categories (Acts)', template='plotly_dark', title_font_size=20, title_x=0.5 ) return fig def get_all_eda_figures() -> dict: """ Generate all EDA figures at once. Returns: Dictionary containing all figure objects """ return { 'prompt_length': create_prompt_length_histogram(), 'word_count': create_word_count_boxplot(), 'top_words': create_top_words_chart(), 'scatter': create_length_vs_words_scatter(), 'categories': create_category_distribution() }