Spaces:

ohmp
/

EDA-Generator

Runtime error

App Files Files Community

EDA-Generator / eda.py

ohmp

Upload folder using huggingface_hub

043c791 verified 16 days ago

raw

history blame contribute delete

7.35 kB

	"""
	Exploratory Data Analysis (EDA) module for the awesome-chatgpt-prompts dataset.
	Generates visualizations for dataset exploration.
	"""

	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import pandas as pd
	import numpy as np
	from datasets import load_dataset
	from collections import Counter
	import re

	# Set style for matplotlib: both calls below run once, at import time.
	# NOTE(review): the 'seaborn-v0_8-darkgrid' style name is presumably only
	# available in newer matplotlib releases — confirm against the pinned version.
	# NOTE(review): the chart functions visible in this module draw with plotly,
	# so these settings presumably only matter to matplotlib/seaborn code elsewhere.
	plt.style.use('seaborn-v0_8-darkgrid')
	sns.set_palette("husl")

	# Dataset configuration: identifier handed to `load_dataset` by
	# load_prompts_dataset().
	DATASET_NAME = "fka/awesome-chatgpt-prompts"


	def _count_sentences(text: str) -> int:
	    """
	    Count the non-empty segments of a text separated by runs of '.', '!' or '?'.

	    Args:
	        text: Text to measure

	    Returns:
	        Number of segments that contain at least one non-whitespace character
	    """
	    # re.split leaves an empty trailing piece when the text ends with
	    # punctuation ("Hi." -> ['Hi', '']); such pieces are not sentences.
	    return sum(1 for segment in re.split(r'[.!?]+', text) if segment.strip())


	def load_prompts_dataset() -> pd.DataFrame:
	    """
	    Load the "train" split of DATASET_NAME and add text-size columns.

	    Added columns (all derived from the 'prompt' column):
	        prompt_length: number of characters
	        word_count: number of whitespace-separated tokens
	        sentence_count: number of non-empty segments between '.', '!', '?' runs

	    Missing prompts are measured as empty text (all three metrics are 0);
	    the 'prompt' column itself is left untouched.

	    Returns:
	        DataFrame with the prompts data and the computed columns
	    """
	    dataset = load_dataset(DATASET_NAME, split="train")
	    df = dataset.to_pandas()

	    # Measure a sanitized copy so a None/NaN prompt cannot break len()/split().
	    prompts = df['prompt'].fillna('').astype(str)

	    # Add computed columns for analysis
	    df['prompt_length'] = prompts.str.len()
	    df['word_count'] = prompts.str.split().str.len()
	    df['sentence_count'] = prompts.apply(_count_sentences)

	    return df


	def create_prompt_length_histogram() -> go.Figure:
	    """
	    Build an interactive histogram of prompt lengths measured in characters.

	    The histogram uses 50 bins, the dark plotly template, and a dashed
	    vertical line annotated with the mean prompt length.

	    Returns:
	        Plotly Figure object
	    """
	    prompts_df = load_prompts_dataset()
	    average_chars = prompts_df['prompt_length'].mean()

	    axis_labels = {
	        'prompt_length': 'Prompt Length (characters)',
	        'count': 'Frequency',
	    }
	    histogram = px.histogram(
	        prompts_df,
	        x='prompt_length',
	        nbins=50,
	        title='📏 Distribution of Prompt Lengths (Characters)',
	        labels=axis_labels,
	        color_discrete_sequence=['#667eea'],
	    )

	    layout_settings = {
	        'template': 'plotly_dark',
	        'title_font_size': 20,
	        'title_x': 0.5,
	        'showlegend': False,
	        'xaxis_title': "Character Count",
	        'yaxis_title': "Number of Prompts",
	    }
	    histogram.update_layout(**layout_settings)

	    # Dashed marker at the mean prompt length
	    histogram.add_vline(
	        x=average_chars,
	        line_dash="dash",
	        line_color="#ff6b6b",
	        annotation_text=f"Mean: {average_chars:.0f}",
	        annotation_position="top",
	    )

	    return histogram


	def create_word_count_boxplot() -> go.Figure:
	    """
	    Build a single box plot of the per-prompt word counts.

	    The box also displays the mean and standard deviation (boxmean='sd').

	    Returns:
	        Plotly Figure object
	    """
	    prompts_df = load_prompts_dataset()

	    word_box = go.Box(
	        y=prompts_df['word_count'],
	        name='Word Count',
	        marker_color='#764ba2',
	        boxmean='sd',
	    )

	    boxplot = go.Figure()
	    boxplot.add_trace(word_box)

	    layout_settings = {
	        'title': '📊 Word Count Distribution',
	        'template': 'plotly_dark',
	        'title_font_size': 20,
	        'title_x': 0.5,
	        'yaxis_title': "Words per Prompt",
	        'showlegend': False,
	    }
	    boxplot.update_layout(**layout_settings)

	    return boxplot


	# Common English function words left out of the frequency ranking.
	# Built once at import time instead of on every chart request.
	_STOP_WORDS = frozenset({
	    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
	    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
	    'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
	    'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
	    'used', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
	    'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
	    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
	    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'also',
	    'your', 'my', 'this', 'that', 'these', 'those', 'me', 'him', 'her',
	    'us', 'them', 'if', 'then', 'else', 'while', 'about', 'into', 'through',
	    'during', 'before', 'after', 'above', 'below', 'between', 'any', 'their',
	})

	# Runs of ASCII letters delimited by word boundaries.
	_WORD_PATTERN = re.compile(r'\b[a-zA-Z]+\b')


	def create_top_words_chart(top_n: int = 20) -> go.Figure:
	    """
	    Create a horizontal bar chart of the most common words in prompts.

	    Words are lower-cased runs of ASCII letters; stop words and words of
	    two letters or fewer are ignored. The most frequent word is drawn at
	    the top of the chart.

	    Args:
	        top_n: Number of top words to display. When fewer distinct words
	            exist (or top_n is not positive) the chart simply shows fewer
	            bars, possibly none.

	    Returns:
	        Plotly Figure object
	    """
	    df = load_prompts_dataset()

	    # Count prompt by prompt so no list of every token is materialized.
	    word_counts = Counter()
	    for prompt in df['prompt']:
	        if not isinstance(prompt, str):
	            # Missing prompts (None/NaN) contribute no words.
	            continue
	        word_counts.update(
	            word
	            for word in _WORD_PATTERN.findall(prompt.lower())
	            if len(word) > 2 and word not in _STOP_WORDS
	        )

	    ranked = word_counts.most_common(top_n)
	    if ranked:
	        words, counts = zip(*ranked)
	    else:
	        # zip(*[]) cannot be unpacked into two names; draw an empty chart.
	        words, counts = (), ()

	    fig = go.Figure(go.Bar(
	        # Reversed so the highest count ends up at the top of the y axis.
	        x=list(counts)[::-1],
	        y=list(words)[::-1],
	        orientation='h',
	        marker=dict(
	            # One color index per bar actually drawn, which may be < top_n.
	            color=list(range(len(words))),
	            colorscale='Viridis'
	        )
	    ))

	    fig.update_layout(
	        title=f'🔤 Top {top_n} Most Common Words',
	        template='plotly_dark',
	        title_font_size=20,
	        title_x=0.5,
	        xaxis_title="Frequency",
	        yaxis_title="Word",
	        height=600
	    )

	    return fig


	def create_length_vs_words_scatter() -> go.Figure:
	    """
	    Build a scatter plot of character length against word count.

	    Points are colored by sentence count and show the 'act' column on hover.

	    Returns:
	        Plotly Figure object
	    """
	    prompts_df = load_prompts_dataset()

	    axis_labels = {
	        'word_count': 'Word Count',
	        'prompt_length': 'Character Length',
	    }
	    scatter = px.scatter(
	        prompts_df,
	        x='word_count',
	        y='prompt_length',
	        title='📈 Prompt Length vs Word Count',
	        labels=axis_labels,
	        color='sentence_count',
	        color_continuous_scale='Plasma',
	        hover_data=['act'],
	    )

	    layout_settings = {
	        'template': 'plotly_dark',
	        'title_font_size': 20,
	        'title_x': 0.5,
	    }
	    scatter.update_layout(**layout_settings)

	    return scatter


	def create_summary_stats() -> pd.DataFrame:
	    """
	    Build a two-column table of headline statistics for the dataset.

	    Averages and the median are pre-formatted as strings; the prompt total
	    and the max/min lengths are kept as numbers.

	    Returns:
	        DataFrame with a 'Metric' column and a 'Value' column
	    """
	    prompts_df = load_prompts_dataset()
	    char_lengths = prompts_df['prompt_length']
	    word_totals = prompts_df['word_count']

	    # One (label, value) pair per table row, in display order.
	    stat_rows = [
	        ('Total Prompts', len(prompts_df)),
	        ('Average Length (chars)', f"{char_lengths.mean():.0f}"),
	        ('Average Word Count', f"{word_totals.mean():.1f}"),
	        ('Max Length (chars)', char_lengths.max()),
	        ('Min Length (chars)', char_lengths.min()),
	        ('Median Word Count', f"{word_totals.median():.0f}"),
	    ]

	    metric_names = [label for label, _ in stat_rows]
	    metric_values = [value for _, value in stat_rows]

	    return pd.DataFrame({'Metric': metric_names, 'Value': metric_values})


	def create_category_distribution() -> go.Figure:
	    """
	    Build a donut chart of the ten most frequent prompt categories (acts).

	    Returns:
	        Plotly Figure object
	    """
	    prompts_df = load_prompts_dataset()

	    # value_counts() is sorted by frequency, so the first ten rows are the top ten
	    act_frequencies = prompts_df['act'].value_counts()
	    leading_acts = act_frequencies.iloc[:10]

	    donut_trace = go.Pie(
	        labels=leading_acts.index,
	        values=leading_acts.values,
	        hole=0.4,
	        marker={'colors': px.colors.qualitative.Set3},
	    )
	    donut = go.Figure(donut_trace)

	    layout_settings = {
	        'title': '🎭 Top 10 Prompt Categories (Acts)',
	        'template': 'plotly_dark',
	        'title_font_size': 20,
	        'title_x': 0.5,
	    }
	    donut.update_layout(**layout_settings)

	    return donut


	def get_all_eda_figures() -> dict:
	    """
	    Build every EDA figure in one call.

	    Each builder is invoked with its default arguments, in the order listed.

	    Returns:
	        Dictionary mapping a short figure key to its figure object
	    """
	    figure_builders = (
	        ('prompt_length', create_prompt_length_histogram),
	        ('word_count', create_word_count_boxplot),
	        ('top_words', create_top_words_chart),
	        ('scatter', create_length_vs_words_scatter),
	        ('categories', create_category_distribution),
	    )

	    return {figure_key: build() for figure_key, build in figure_builders}