EDA-Generator / eda.py
ohmp's picture
Upload folder using huggingface_hub
043c791 verified
"""
Exploratory Data Analysis (EDA) module for the awesome-chatgpt-prompts dataset.
Generates visualizations for dataset exploration.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from datasets import load_dataset
from collections import Counter
import re
# Apply the 'seaborn-v0_8-darkgrid' style sheet to matplotlib.
# NOTE: this runs at import time, so importing this module changes
# matplotlib's global style for the whole process.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default color palette (also set at import time).
sns.set_palette("husl")
# Dataset configuration: identifier that load_prompts_dataset() passes to
# datasets.load_dataset().
DATASET_NAME = "fka/awesome-chatgpt-prompts"
def _count_sentences(text: str) -> int:
    """Return the number of non-blank segments of *text* delimited by '.', '!' or '?'."""
    # re.split() yields an empty string after terminal punctuation
    # ("Hi. Bye." -> ['Hi', ' Bye', '']), so blank segments must not be counted.
    return sum(1 for segment in re.split(r'[.!?]+', text) if segment.strip())


def load_prompts_dataset() -> pd.DataFrame:
    """
    Load the awesome-chatgpt-prompts dataset from HuggingFace.

    The "train" split of DATASET_NAME is converted to a pandas DataFrame and
    three computed columns are added for analysis:

    - 'prompt_length':  number of characters in the prompt
    - 'word_count':     number of whitespace-separated tokens
    - 'sentence_count': number of non-blank segments between '.', '!' and '?'
      runs (a prompt ending in punctuation is not over-counted, and an
      empty prompt counts as 0 sentences)

    Missing prompts are measured as empty text (all three metrics are 0);
    the stored 'prompt' column itself is left unchanged.

    Returns:
        DataFrame with the prompts data plus the computed columns
    """
    dataset = load_dataset(DATASET_NAME, split="train")
    df = dataset.to_pandas()

    # Measure a cleaned view so None/NaN prompts do not raise in len()/split().
    prompt_text = df['prompt'].fillna('').astype(str)

    # Add computed columns for analysis
    df['prompt_length'] = prompt_text.str.len()
    df['word_count'] = prompt_text.str.split().str.len()
    df['sentence_count'] = prompt_text.apply(_count_sentences)
    return df
def create_prompt_length_histogram() -> go.Figure:
    """
    Build an interactive histogram of prompt lengths in characters.

    The data comes from load_prompts_dataset(); a dashed vertical line
    annotated with the rounded mean marks the average prompt length.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()
    average_length = prompts_df['prompt_length'].mean()

    axis_labels = {'prompt_length': 'Prompt Length (characters)', 'count': 'Frequency'}
    histogram = px.histogram(
        prompts_df,
        x='prompt_length',
        nbins=50,
        title='πŸ“ Distribution of Prompt Lengths (Characters)',
        labels=axis_labels,
        color_discrete_sequence=['#667eea'],
    )

    layout_options = {
        'template': 'plotly_dark',
        'title_font_size': 20,
        'title_x': 0.5,  # center the title
        'showlegend': False,
        'xaxis_title': "Character Count",
        'yaxis_title': "Number of Prompts",
    }
    histogram.update_layout(**layout_options)

    # Mark the mean prompt length with a dashed vertical line
    histogram.add_vline(
        x=average_length,
        line_dash="dash",
        line_color="#ff6b6b",
        annotation_text=f"Mean: {average_length:.0f}",
        annotation_position="top",
    )
    return histogram
def create_word_count_boxplot() -> go.Figure:
    """
    Build a boxplot of the per-prompt word counts.

    The box trace is created with boxmean='sd', so the mean and standard
    deviation are drawn in addition to the quartiles.

    Returns:
        Plotly Figure object
    """
    word_counts = load_prompts_dataset()['word_count']

    box_trace = go.Box(
        y=word_counts,
        name='Word Count',
        marker_color='#764ba2',
        boxmean='sd',
    )

    figure = go.Figure()
    figure.add_trace(box_trace)

    layout_options = {
        'title': 'πŸ“Š Word Count Distribution',
        'template': 'plotly_dark',
        'title_font_size': 20,
        'title_x': 0.5,  # center the title
        'yaxis_title': "Words per Prompt",
        'showlegend': False,
    }
    figure.update_layout(**layout_options)
    return figure
# Common English words excluded from the top-words ranking. Built once at
# import time instead of on every call.
_TOP_WORDS_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
    'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
    'used', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
    'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'also',
    'your', 'my', 'this', 'that', 'these', 'those', 'me', 'him', 'her',
    'us', 'them', 'if', 'then', 'else', 'while', 'about', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'between', 'any', 'their',
})

# Runs of ASCII letters delimited by word boundaries.
_TOP_WORDS_TOKEN_RE = re.compile(r'\b[a-zA-Z]+\b')


def create_top_words_chart(top_n: int = 20) -> go.Figure:
    """
    Create a bar chart of the most common words in prompts.

    Prompts are lower-cased and tokenized into runs of ASCII letters; stop
    words and words of one or two letters are ignored. The bars are drawn
    horizontally with the most frequent word at the top.

    If fewer than ``top_n`` distinct words exist (including none at all, or
    ``top_n <= 0``), the chart simply contains fewer (possibly zero) bars
    instead of raising.

    Args:
        top_n: Number of top words to display

    Returns:
        Plotly Figure object
    """
    df = load_prompts_dataset()

    # Tokenize and count words (excluding common stop words)
    word_counter = Counter()
    for prompt in df['prompt']:
        if not isinstance(prompt, str):
            continue  # missing prompt (None/NaN): nothing to tokenize
        word_counter.update(
            word
            for word in _TOP_WORDS_TOKEN_RE.findall(prompt.lower())
            if len(word) > 2 and word not in _TOP_WORDS_STOP_WORDS
        )

    ranked = word_counter.most_common(top_n)
    # Reverse so the most frequent word ends up as the top bar of the
    # horizontal chart. Built with comprehensions rather than zip(*ranked),
    # which cannot be unpacked when `ranked` is empty.
    words = [word for word, _ in ranked][::-1]
    counts = [count for _, count in ranked][::-1]

    fig = go.Figure(go.Bar(
        x=counts,
        y=words,
        orientation='h',
        marker=dict(
            # One color index per bar actually drawn (may be fewer than top_n)
            color=list(range(len(words))),
            colorscale='Viridis'
        )
    ))
    fig.update_layout(
        title=f'πŸ”€ Top {top_n} Most Common Words',
        template='plotly_dark',
        title_font_size=20,
        title_x=0.5,
        xaxis_title="Frequency",
        yaxis_title="Word",
        height=600
    )
    return fig
def create_length_vs_words_scatter() -> go.Figure:
    """
    Build a scatter plot of character length against word count.

    Each point is one prompt, colored by its 'sentence_count' value, with
    the 'act' column added to the hover tooltip.

    Returns:
        Plotly Figure object
    """
    prompts_df = load_prompts_dataset()

    axis_labels = {
        'word_count': 'Word Count',
        'prompt_length': 'Character Length',
    }
    scatter = px.scatter(
        prompts_df,
        x='word_count',
        y='prompt_length',
        title='πŸ“ˆ Prompt Length vs Word Count',
        labels=axis_labels,
        color='sentence_count',
        color_continuous_scale='Plasma',
        hover_data=['act'],
    )

    layout_options = {
        'template': 'plotly_dark',
        'title_font_size': 20,
        'title_x': 0.5,  # center the title
    }
    scatter.update_layout(**layout_options)
    return scatter
def create_summary_stats() -> pd.DataFrame:
    """
    Build a two-column summary table ('Metric', 'Value') for the dataset.

    Averages and the median are pre-formatted as strings; the prompt total
    and the max/min lengths are left as numbers.

    Returns:
        DataFrame with summary statistics
    """
    prompts_df = load_prompts_dataset()
    char_lengths = prompts_df['prompt_length']
    word_counts = prompts_df['word_count']

    # (metric label, value) pairs, in display order
    rows = [
        ('Total Prompts', len(prompts_df)),
        ('Average Length (chars)', f"{char_lengths.mean():.0f}"),
        ('Average Word Count', f"{word_counts.mean():.1f}"),
        ('Max Length (chars)', char_lengths.max()),
        ('Min Length (chars)', char_lengths.min()),
        ('Median Word Count', f"{word_counts.median():.0f}"),
    ]

    metric_names = [label for label, _ in rows]
    metric_values = [value for _, value in rows]
    return pd.DataFrame({'Metric': metric_names, 'Value': metric_values})
def create_category_distribution() -> go.Figure:
    """
    Build a donut-style pie chart of the ten most frequent 'act' values.

    Returns:
        Plotly Figure object
    """
    act_counts = load_prompts_dataset()['act'].value_counts()
    # value_counts() is sorted by frequency, so head(10) is the top 10
    top_acts = act_counts.head(10)

    donut = go.Pie(
        labels=top_acts.index,
        values=top_acts.values,
        hole=0.4,
        marker={'colors': px.colors.qualitative.Set3},
    )
    figure = go.Figure(donut)

    layout_options = {
        'title': '🎭 Top 10 Prompt Categories (Acts)',
        'template': 'plotly_dark',
        'title_font_size': 20,
        'title_x': 0.5,  # center the title
    }
    figure.update_layout(**layout_options)
    return figure
def get_all_eda_figures() -> dict:
    """
    Build every EDA figure in one call.

    Each builder is invoked with its default arguments, in the order listed
    below, and the result is stored under the builder's key.

    Returns:
        Dictionary containing all figure objects
    """
    figure_builders = (
        ('prompt_length', create_prompt_length_histogram),
        ('word_count', create_word_count_boxplot),
        ('top_words', create_top_words_chart),
        ('scatter', create_length_vs_words_scatter),
        ('categories', create_category_distribution),
    )
    return {key: build() for key, build in figure_builders}