Spaces:
Runtime error
Runtime error
| """ | |
| Exploratory Data Analysis (EDA) module for the awesome-chatgpt-prompts dataset. | |
| Generates visualizations for dataset exploration. | |
| """ | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import pandas as pd | |
| import numpy as np | |
| from datasets import load_dataset | |
| from collections import Counter | |
| import re | |
# Matplotlib / seaborn styling, applied once at import time.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*",
# so the new name is tried first and the pre-3.6 name second. If neither is
# registered the default style is kept rather than failing the whole import.
_DARKGRID_STYLE = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
        if style_name in plt.style.available
    ),
    None,
)
if _DARKGRID_STYLE is not None:
    plt.style.use(_DARKGRID_STYLE)
sns.set_palette("husl")

# Dataset configuration: identifier passed to `load_dataset`.
DATASET_NAME = "fka/awesome-chatgpt-prompts"
# Runs of sentence-terminating punctuation. Compiled once because the pattern
# is applied to every prompt in the dataset.
_SENTENCE_END = re.compile(r'[.!?]+')


def _count_sentences(text: str) -> int:
    """Count the non-empty segments of ``text`` delimited by '.', '!' or '?'."""
    # Splitting leaves an empty string after trailing punctuation (and before
    # leading punctuation); those must not be counted as sentences.
    return sum(1 for segment in _SENTENCE_END.split(text) if segment.strip())


def load_prompts_dataset() -> pd.DataFrame:
    """
    Load the awesome-chatgpt-prompts dataset from HuggingFace.

    The "train" split of ``DATASET_NAME`` is converted to a DataFrame and
    three derived columns are added, all computed from the ``prompt`` column:

    * ``prompt_length``  - number of characters
    * ``word_count``     - number of whitespace-separated tokens
    * ``sentence_count`` - number of non-empty segments separated by runs of
      '.', '!' or '?'. This is a punctuation heuristic, so abbreviations and
      decimals also split.

    Returns:
        DataFrame with the prompts data plus the derived columns
    """
    dataset = load_dataset(DATASET_NAME, split="train")
    df = dataset.to_pandas()

    # Add computed columns for analysis
    df['prompt_length'] = df['prompt'].apply(len)
    df['word_count'] = df['prompt'].apply(lambda x: len(x.split()))
    df['sentence_count'] = df['prompt'].apply(_count_sentences)
    return df
def create_prompt_length_histogram() -> go.Figure:
    """
    Build an interactive histogram of prompt character lengths.

    The dataset is loaded on each call. The histogram is requested with
    ``nbins=50`` and a dashed vertical line marks the mean length.

    Returns:
        Plotly Figure object
    """
    prompts = load_prompts_dataset()
    average_chars = prompts['prompt_length'].mean()

    axis_labels = {
        'prompt_length': 'Prompt Length (characters)',
        'count': 'Frequency',
    }
    # NOTE(review): the leading 'π' in the title looks like a mis-decoded
    # emoji - confirm against the original file before changing it.
    histogram = px.histogram(
        prompts,
        x='prompt_length',
        nbins=50,
        title='π Distribution of Prompt Lengths (Characters)',
        labels=axis_labels,
        color_discrete_sequence=['#667eea'],
    )

    layout_options = dict(
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        showlegend=False,
        xaxis_title="Character Count",
        yaxis_title="Number of Prompts",
    )
    histogram.update_layout(**layout_options)

    # Mark the mean with a dashed line, labelled with the rounded value.
    histogram.add_vline(
        x=average_chars,
        line_dash="dash",
        line_color="#ff6b6b",
        annotation_text=f"Mean: {average_chars:.0f}",
        annotation_position="top",
    )
    return histogram
def create_word_count_boxplot() -> go.Figure:
    """
    Build a box plot of the per-prompt word counts.

    The dataset is loaded on each call. The box trace is created with
    ``boxmean='sd'``.

    Returns:
        Plotly Figure object
    """
    prompts = load_prompts_dataset()

    word_count_box = go.Box(
        y=prompts['word_count'],
        name='Word Count',
        marker_color='#764ba2',
        boxmean='sd',
    )

    figure = go.Figure()
    figure.add_trace(word_count_box)

    # NOTE(review): the leading 'π' in the title looks like a mis-decoded
    # emoji - confirm against the original file before changing it.
    layout_options = dict(
        title='π Word Count Distribution',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        yaxis_title="Words per Prompt",
        showlegend=False,
    )
    figure.update_layout(**layout_options)
    return figure
def create_top_words_chart(top_n: int = 20) -> go.Figure:
    """
    Create a bar chart of the most common words in prompts.

    Prompts are lower-cased and split into runs of ASCII letters. Stop words
    and words of two characters or fewer are ignored. If fewer than ``top_n``
    distinct words remain (including none at all), the chart shows however
    many there are instead of raising.

    Args:
        top_n: Number of top words to display

    Returns:
        Plotly Figure object
    """
    df = load_prompts_dataset()

    # Tokenize and count words (excluding common stop words)
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
        'used', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
        'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
        'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'also',
        'your', 'my', 'this', 'that', 'these', 'those', 'me', 'him', 'her',
        'us', 'them', 'if', 'then', 'else', 'while', 'about', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'between', 'any', 'their'
    }

    word_totals = Counter(
        word
        for prompt in df['prompt']
        for word in re.findall(r'\b[a-zA-Z]+\b', prompt.lower())
        if len(word) > 2 and word not in stop_words
    )
    ranked = word_totals.most_common(top_n)

    # Reverse the ranking so the most frequent word ends up at the top of the
    # horizontal bar chart (the first category is drawn at the bottom).
    # Building the two lists directly also works when `ranked` is empty,
    # which unpacking `zip(*ranked)` does not.
    words = [word for word, _ in reversed(ranked)]
    counts = [count for _, count in reversed(ranked)]

    fig = go.Figure(go.Bar(
        x=counts,
        y=words,
        orientation='h',
        marker=dict(
            # One colour index per bar actually drawn, so the ramp stays
            # aligned when fewer than `top_n` words are available.
            color=list(range(len(ranked))),
            colorscale='Viridis'
        )
    ))

    # NOTE(review): the leading 'π€' in the title looks like a mis-decoded
    # emoji - confirm against the original file before changing it.
    fig.update_layout(
        title=f'π€ Top {top_n} Most Common Words',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        xaxis_title="Frequency",
        yaxis_title="Word",
        height=600
    )
    return fig
def create_length_vs_words_scatter() -> go.Figure:
    """
    Build a scatter plot of character length against word count.

    The dataset is loaded on each call. Points are coloured by the
    ``sentence_count`` column and the ``act`` column is added to the hover
    data.

    Returns:
        Plotly Figure object
    """
    prompts = load_prompts_dataset()

    axis_labels = {
        'word_count': 'Word Count',
        'prompt_length': 'Character Length',
    }
    # NOTE(review): the leading 'π' in the title looks like a mis-decoded
    # emoji - confirm against the original file before changing it.
    scatter = px.scatter(
        prompts,
        x='word_count',
        y='prompt_length',
        title='π Prompt Length vs Word Count',
        labels=axis_labels,
        color='sentence_count',
        color_continuous_scale='Plasma',
        hover_data=['act'],
    )

    layout_options = dict(
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
    )
    scatter.update_layout(**layout_options)
    return scatter
def create_summary_stats() -> pd.DataFrame:
    """
    Build a two-column table of headline statistics for the dataset.

    The dataset is loaded on each call. Means and the median are
    pre-formatted as strings; the row count and the min/max lengths are
    left as numbers.

    Returns:
        DataFrame with summary statistics ('Metric' and 'Value' columns)
    """
    prompts = load_prompts_dataset()
    char_lengths = prompts['prompt_length']
    word_counts = prompts['word_count']

    # (label, value) pairs in display order.
    rows = [
        ('Total Prompts', len(prompts)),
        ('Average Length (chars)', f"{char_lengths.mean():.0f}"),
        ('Average Word Count', f"{word_counts.mean():.1f}"),
        ('Max Length (chars)', char_lengths.max()),
        ('Min Length (chars)', char_lengths.min()),
        ('Median Word Count', f"{word_counts.median():.0f}"),
    ]

    summary = {
        'Metric': [label for label, _ in rows],
        'Value': [value for _, value in rows],
    }
    return pd.DataFrame(summary)
def create_category_distribution() -> go.Figure:
    """
    Build a donut-style pie chart of the most frequent ``act`` values.

    The dataset is loaded on each call and only the first ten entries of
    ``value_counts()`` on the ``act`` column are shown.

    Returns:
        Plotly Figure object
    """
    prompts = load_prompts_dataset()

    # Get top 10 categories
    act_frequencies = prompts['act'].value_counts()
    leading_acts = act_frequencies.head(10)

    slice_colors = dict(colors=px.colors.qualitative.Set3)
    donut = go.Pie(
        labels=leading_acts.index,
        values=leading_acts.values,
        hole=0.4,
        marker=slice_colors,
    )
    figure = go.Figure(donut)

    # NOTE(review): the leading 'π' in the title looks like a mis-decoded
    # emoji - confirm against the original file before changing it.
    layout_options = dict(
        title='π Top 10 Prompt Categories (Acts)',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
    )
    figure.update_layout(**layout_options)
    return figure
def get_all_eda_figures() -> dict:
    """
    Build every EDA figure and return them keyed by a short name.

    Each builder is called with its default arguments, in the order listed
    below. Every builder loads the dataset itself, so the dataset is loaded
    once per figure.

    Returns:
        Dictionary containing all figure objects
    """
    figure_builders = (
        ('prompt_length', create_prompt_length_histogram),
        ('word_count', create_word_count_boxplot),
        ('top_words', create_top_words_chart),
        ('scatter', create_length_vs_words_scatter),
        ('categories', create_category_distribution),
    )
    return {key: build() for key, build in figure_builders}