# Spaces: ohmp / EDA-Generator
# Runtime error
# File size: 7,345 Bytes
# 043c791

"""
Exploratory Data Analysis (EDA) module for the awesome-chatgpt-prompts dataset.
Generates visualizations for dataset exploration.
"""

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from datasets import load_dataset
from collections import Counter
import re

# Set style for matplotlib.
# The 'seaborn-v0_8-*' style names exist only in matplotlib >= 3.6, while older
# releases ship the un-prefixed 'seaborn-*' names. Use whichever darkgrid
# variant this installation provides so that importing the module does not
# fail over a purely cosmetic setting; if neither exists the default style is
# left untouched.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

# Dataset configuration: HuggingFace Hub identifier passed to load_dataset()
DATASET_NAME = "fka/awesome-chatgpt-prompts"


def _count_sentences(text: str) -> int:
    """Count the non-blank segments of *text* delimited by '.', '!' or '?'."""
    # re.split() yields an empty string after a trailing terminator
    # ("Hi." -> ['Hi', '']), so blank pieces are filtered out rather than
    # counted as an extra sentence.
    return sum(1 for part in re.split(r'[.!?]+', text) if part.strip())


def load_prompts_dataset() -> pd.DataFrame:
    """
    Load the awesome-chatgpt-prompts dataset from HuggingFace.

    The "train" split of DATASET_NAME is converted to a pandas DataFrame and
    three derived columns are appended:

    * ``prompt_length``  - number of characters in the prompt
    * ``word_count``     - number of whitespace-separated tokens
    * ``sentence_count`` - number of non-blank segments separated by runs of
      '.', '!' or '?'

    Returns:
        DataFrame with the prompts data plus the computed columns
    """
    dataset = load_dataset(DATASET_NAME, split="train")
    df = dataset.to_pandas()

    # Compute the metrics from a null-safe string view so that a missing
    # prompt yields zeros instead of raising; the original 'prompt' column is
    # left as loaded.
    prompts = df['prompt'].fillna('').astype(str)

    # Add computed columns for analysis
    df['prompt_length'] = prompts.apply(len)
    df['word_count'] = prompts.apply(lambda text: len(text.split()))
    df['sentence_count'] = prompts.apply(_count_sentences)

    return df


def create_prompt_length_histogram() -> go.Figure:
    """
    Build an interactive histogram of prompt lengths measured in characters.

    The data comes from load_prompts_dataset(). The figure uses the
    'plotly_dark' template and carries a dashed vertical line, annotated with
    the rounded value, at the mean prompt length.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()
    average_length = prompts_df['prompt_length'].mean()

    axis_labels = {
        'prompt_length': 'Prompt Length (characters)',
        'count': 'Frequency',
    }
    histogram = px.histogram(
        prompts_df,
        x='prompt_length',
        nbins=50,
        title='📏 Distribution of Prompt Lengths (Characters)',
        labels=axis_labels,
        color_discrete_sequence=['#667eea'],
    )

    # Cosmetic layout: centered title, explicit axis captions, no legend
    layout_options = dict(
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        showlegend=False,
        xaxis_title="Character Count",
        yaxis_title="Number of Prompts",
    )
    histogram.update_layout(**layout_options)

    # Mark the mean with a dashed line and a label above it
    histogram.add_vline(
        x=average_length,
        line_dash="dash",
        line_color="#ff6b6b",
        annotation_text=f"Mean: {average_length:.0f}",
        annotation_position="top",
    )

    return histogram


def create_word_count_boxplot() -> go.Figure:
    """
    Build a box plot of the number of words per prompt.

    The data comes from load_prompts_dataset(); the box trace is created with
    boxmean='sd'.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()

    word_count_box = go.Box(
        y=prompts_df['word_count'],
        name='Word Count',
        marker_color='#764ba2',
        boxmean='sd',
    )
    boxplot = go.Figure(data=[word_count_box])

    # Single-trace figure, so the legend is hidden
    layout_options = dict(
        title='📊 Word Count Distribution',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        yaxis_title="Words per Prompt",
        showlegend=False,
    )
    boxplot.update_layout(**layout_options)

    return boxplot


def create_top_words_chart(top_n: int = 20) -> go.Figure:
    """
    Create a horizontal bar chart of the most common words in prompts.

    Prompts are lower-cased and split into purely alphabetic tokens; stop
    words and tokens of one or two letters are ignored. Missing prompts are
    skipped.

    Args:
        top_n: Maximum number of top words to display. If fewer distinct
            words are available (or top_n is not positive) the chart simply
            contains fewer bars, possibly none.

    Returns:
        Plotly Figure object
    """
    df = load_prompts_dataset()

    # Tokenize and count words (excluding common stop words)
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
        'used', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
        'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
        'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'also',
        'your', 'my', 'this', 'that', 'these', 'those', 'me', 'him', 'her',
        'us', 'them', 'if', 'then', 'else', 'while', 'about', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'between', 'any', 'their'
    }

    # Compile once instead of looking the pattern up for every prompt
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')

    tally = Counter()
    for prompt in df['prompt'].dropna():
        tally.update(
            word
            for word in word_pattern.findall(prompt.lower())
            if len(word) > 2 and word not in stop_words
        )

    top_words = tally.most_common(max(top_n, 0))

    # Split the (word, count) pairs without zip(*...), which cannot be
    # unpacked into two names when the list is empty.
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    shown = len(words)

    # Reverse so the most frequent word ends up at the top of the
    # horizontal chart; one color index per bar actually drawn.
    fig = go.Figure(go.Bar(
        x=counts[::-1],
        y=words[::-1],
        orientation='h',
        marker=dict(
            color=list(range(shown)),
            colorscale='Viridis'
        )
    ))

    fig.update_layout(
        title=f'🔤 Top {shown} Most Common Words',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        xaxis_title="Frequency",
        yaxis_title="Word",
        height=600
    )

    return fig


def create_length_vs_words_scatter() -> go.Figure:
    """
    Build a scatter plot of character length against word count.

    Each point is one row from load_prompts_dataset(); points are colored by
    the 'sentence_count' column and the 'act' column is added to the hover
    data.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()

    axis_labels = {
        'word_count': 'Word Count',
        'prompt_length': 'Character Length',
    }
    scatter = px.scatter(
        prompts_df,
        x='word_count',
        y='prompt_length',
        title='📈 Prompt Length vs Word Count',
        labels=axis_labels,
        color='sentence_count',
        color_continuous_scale='Plasma',
        hover_data=['act'],
    )

    # Dark theme with a centered title
    scatter.update_layout(template='plotly_dark', title_font_size=20, title_x=0.5)

    return scatter


def create_summary_stats() -> pd.DataFrame:
    """
    Build a two-column table of headline statistics for the dataset.

    The 'Metric' column holds the label and the 'Value' column the matching
    figure; averages and the median are pre-formatted as strings, while the
    row count and the min/max lengths are kept as numbers.

    Returns:
        DataFrame with summary statistics
    """
    prompts_df = load_prompts_dataset()
    char_lengths = prompts_df['prompt_length']
    word_counts = prompts_df['word_count']

    # One (label, value) pair per table row, in display order
    rows = [
        ('Total Prompts', len(prompts_df)),
        ('Average Length (chars)', f"{char_lengths.mean():.0f}"),
        ('Average Word Count', f"{word_counts.mean():.1f}"),
        ('Max Length (chars)', char_lengths.max()),
        ('Min Length (chars)', char_lengths.min()),
        ('Median Word Count', f"{word_counts.median():.0f}"),
    ]
    metric_names, metric_values = zip(*rows)

    return pd.DataFrame({
        'Metric': list(metric_names),
        'Value': list(metric_values),
    })


def create_category_distribution() -> go.Figure:
    """
    Build a donut chart of the ten most frequent values of the 'act' column.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()

    # value_counts() sorts by frequency, so the first ten rows are the top ten
    act_frequencies = prompts_df['act'].value_counts()
    leading_acts = act_frequencies.head(10)

    donut = go.Pie(
        labels=leading_acts.index,
        values=leading_acts.values,
        hole=0.4,
        marker=dict(colors=px.colors.qualitative.Set3),
    )
    chart = go.Figure(data=[donut])

    layout_options = dict(
        title='🎭 Top 10 Prompt Categories (Acts)',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
    )
    chart.update_layout(**layout_options)

    return chart


def get_all_eda_figures() -> dict:
    """
    Build every EDA figure in one call.

    Each figure is produced by its own create_* function, invoked in the
    order listed below.

    Returns:
        Dictionary mapping a short figure key to its Plotly Figure object
    """
    figure_builders = (
        ('prompt_length', create_prompt_length_histogram),
        ('word_count', create_word_count_boxplot),
        ('top_words', create_top_words_chart),
        ('scatter', create_length_vs_words_scatter),
        ('categories', create_category_distribution),
    )

    return {key: build() for key, build in figure_builders}