Spaces:

TroglodyteDerivations
/

Rick_and_Morty_Transcript_Analysis

Sleeping

App Files Files Community

Rick_and_Morty_Transcript_Analysis / app.py

TroglodyteDerivations

Upload 32 files

e3e7844 verified 3 months ago

raw

history blame contribute delete

20.7 kB

	import gradio as gr
	import polars as pl
	import matplotlib.pyplot as plt
	import seaborn as sns
	import numpy as np
	from wordcloud import WordCloud
	import textwrap
	import io
	import re
	import base64

	# Set up styling: global plotting defaults, applied once at import time and
	# shared by every figure built below.
	# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
	# Matplotlib (3.6 or newer) - confirm against the pinned version.
	plt.style.use('seaborn-v0_8-whitegrid')
	# Set after the stylesheet so that a colour cycle defined by the stylesheet,
	# if any, cannot replace the "husl" palette.
	sns.set_palette("husl")

	# Load and prepare data
	def load_data():
	    """Load the transcript CSV and derive the per-line analysis columns.

	    Reads ``Rick-n-Morty.csv`` (path relative to the working directory),
	    renames the raw headers and adds these columns:

	    * ``cleaned_dialogue``: dialogue reduced to word characters, whitespace
	      and ``. ! ? ,``, with whitespace runs collapsed and both ends trimmed.
	    * ``dialogue_length``: character count of ``cleaned_dialogue``.
	    * ``has_exclamation`` / ``has_question``: whether the cleaned line
	      contains ``!`` / ``?``.
	    * ``word_count``: number of pieces obtained by splitting on a space.

	    Rows whose cleaned dialogue is empty are dropped.

	    Returns:
	        The enriched Polars DataFrame.
	    """
	    # Compiled once per load rather than per row: ``clean_text`` runs for
	    # every dialogue line in the file.
	    disallowed_chars = re.compile(r'[^\w\s\.\!\?\,]')
	    whitespace_run = re.compile(r'\s+')

	    def clean_text(text):
	        """Strip unsupported punctuation and normalise whitespace."""
	        if text is None:
	            return ""
	        text = disallowed_chars.sub('', str(text))
	        return whitespace_run.sub(' ', text).strip()

	    column_names = {
	        '': 'line_id',
	        'episode no.': 'episode_no',
	        'speaker': 'character',
	        'dialouge': 'dialogue',  # spelled this way in the CSV header
	    }
	    frame = pl.read_csv('Rick-n-Morty.csv').rename(column_names)

	    frame = frame.with_columns([
	        pl.col('dialogue')
	        .map_elements(clean_text, return_dtype=pl.Utf8)
	        .alias('cleaned_dialogue')
	    ]).filter(pl.col('cleaned_dialogue').str.len_chars() > 0)

	    cleaned = pl.col('cleaned_dialogue')
	    return frame.with_columns([
	        cleaned.str.len_chars().alias('dialogue_length'),
	        cleaned.str.contains(r'!+').alias('has_exclamation'),
	        cleaned.str.contains(r'\?+').alias('has_question'),
	        cleaned.str.split(' ').list.len().alias('word_count'),
	    ])

	# Loaded once at import time; the analysis functions and the UI below all
	# read this module-level frame.
	df = load_data()

	# Analysis functions
	def plot_to_base64(fig):
	    """Render a Matplotlib figure to a PNG ``data:`` URI and close the figure.

	    Args:
	        fig: Figure to serialise. It is closed before this function returns,
	            so the caller must not reuse it.

	    Returns:
	        str: ``data:image/png;base64,...`` text usable as an ``<img src>``.
	    """
	    buf = io.BytesIO()
	    try:
	        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
	    finally:
	        # Close even when rendering fails; otherwise pyplot keeps the figure
	        # registered and every failed request leaks it.
	        plt.close(fig)
	    img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
	    return f"data:image/png;base64,{img_str}"

	def create_overview_dashboard():
	    """Build the four-panel overview figure.

	    Panels: the ten characters with the most lines, lines per episode, a
	    histogram of dialogue length, and exclamation/question counts for the
	    eight most talkative characters among those with more than 50 lines.

	    Returns:
	        str: the figure encoded by ``plot_to_base64``.
	    """
	    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

	    # Plot 1: Character dominance
	    top_chars = (
	        df.group_by('character')
	        .agg(pl.len().alias('lines'))
	        .sort('lines', descending=True)
	        .head(10)
	    )
	    ax1.barh(top_chars['character'].to_list(), top_chars['lines'].to_list())
	    ax1.set_title('Top 10 Characters by Lines', fontweight='bold')
	    ax1.set_xlabel('Number of Lines')

	    # Plot 2: Episode line distribution
	    episode_lines = (
	        df.group_by('episode_no')
	        .agg(pl.len().alias('lines'))
	        .sort('episode_no')
	    )
	    ax2.plot(episode_lines['episode_no'].to_list(), episode_lines['lines'].to_list(), 'o-')
	    ax2.set_title('Lines per Episode', fontweight='bold')
	    ax2.set_xlabel('Episode Number')
	    ax2.set_ylabel('Total Lines')
	    ax2.grid(True, alpha=0.3)

	    # Plot 3: Dialogue length distribution
	    ax3.hist(df['dialogue_length'].to_list(), bins=50, alpha=0.7, edgecolor='black')
	    ax3.set_title('Dialogue Length Distribution', fontweight='bold')
	    ax3.set_xlabel('Characters per Line')
	    ax3.set_ylabel('Frequency')

	    # Plot 4: Emotional content
	    emotional_data = (
	        df.group_by('character')
	        .agg([
	            pl.len().alias('total_lines'),
	            pl.col('has_exclamation').sum().alias('exclamations'),
	            pl.col('has_question').sum().alias('questions'),
	        ])
	        .filter(pl.col('total_lines') > 50)
	        # group_by does not return groups in a defined order, so rank
	        # explicitly; without this the "top" eight are an arbitrary eight.
	        .sort('total_lines', descending=True)
	        .head(8)
	    )

	    x = np.arange(len(emotional_data))
	    width = 0.35
	    ax4.bar(x - width/2, emotional_data['exclamations'].to_list(), width, label='Exclamations')
	    ax4.bar(x + width/2, emotional_data['questions'].to_list(), width, label='Questions')
	    ax4.set_title('Emotional Expression - Top Characters', fontweight='bold')
	    ax4.set_xticks(x)
	    ax4.set_xticklabels(emotional_data['character'].to_list(), rotation=45)
	    ax4.legend()

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()
	    return plot_to_base64(fig)

	def create_episode_insights():
	    """Build the key-episode comparison figure for episodes 6, 7, 12 and 30.

	    Panels: grouped bars of line count, unique characters and average
	    dialogue length per key episode (computed from the loaded data); the
	    distribution of lines per character in episode 12; box plots of dialogue
	    length per key episode; and an excerpt of Rick's longest line in
	    episode 30.

	    Returns:
	        str: the figure encoded by ``plot_to_base64``.
	    """
	    fig = plt.figure(figsize=(16, 10))

	    # Key episodes and the short tag used in their legend/tick labels.
	    key_episodes = {6: 'Monologue', 7: 'Concise', 12: 'Ensemble', 30: 'Dense'}
	    episode_stats = (
	        df.filter(pl.col('episode_no').is_in(list(key_episodes)))
	        .group_by('episode_no')
	        .agg([
	            pl.len().alias('total_lines'),
	            pl.col('dialogue_length').mean().alias('avg_length'),
	            pl.col('character').n_unique().alias('unique_chars'),
	        ])
	        .sort('episode_no')
	    )
	    stats_by_episode = {
	        row['episode_no']: row for row in episode_stats.iter_rows(named=True)
	    }

	    # Plot layout
	    gs = fig.add_gridspec(2, 3)

	    # Plot 1: Comparative metrics, taken from the aggregated data.
	    ax1 = fig.add_subplot(gs[0, 0])
	    metrics = ['Lines', 'Characters', 'Avg Length']
	    x = np.arange(len(metrics))
	    width = 0.2
	    for slot, (episode, tag) in enumerate(key_episodes.items()):
	        row = stats_by_episode.get(episode)
	        if row is None:
	            # Episode absent from the data: draw empty bars, keep the legend.
	            values = [0, 0, 0]
	        else:
	            values = [row['total_lines'], row['unique_chars'], row['avg_length']]
	        # Centre the group of bars on each tick: -1.5w, -0.5w, +0.5w, +1.5w.
	        offset = (slot - (len(key_episodes) - 1) / 2) * width
	        ax1.bar(x + offset, values, width, label=f'Ep {episode}: {tag}', alpha=0.8)

	    ax1.set_title('Key Episode Comparison', fontweight='bold')
	    ax1.set_xticks(x)
	    ax1.set_xticklabels(metrics)
	    ax1.legend()
	    ax1.grid(axis='y', alpha=0.3)

	    # Plot 2: Episode 12 character distribution
	    ax2 = fig.add_subplot(gs[0, 1])
	    ep12 = df.filter(pl.col('episode_no') == 12)
	    char_dist = ep12.group_by('character').agg(pl.len().alias('lines'))
	    line_ranges = ['1 line', '2-5 lines', '6-10 lines', '11+ lines']
	    lines = pl.col('lines')
	    counts = [
	        char_dist.filter(lines == 1).height,
	        char_dist.filter((lines >= 2) & (lines <= 5)).height,
	        char_dist.filter((lines >= 6) & (lines <= 10)).height,
	        char_dist.filter(lines >= 11).height,
	    ]

	    ax2.bar(line_ranges, counts, color=['#FF9999', '#FF6B6B', '#CC4455', '#990033'])
	    ax2.set_title(
	        f'Episode 12: Character Distribution\n({char_dist.height} Unique Characters!)',
	        fontweight='bold',
	    )
	    ax2.set_ylabel('Number of Characters')

	    for i, count in enumerate(counts):
	        ax2.text(i, count + 0.5, str(count), ha='center', va='bottom', fontweight='bold')

	    # Plot 3: Dialogue length comparison
	    ax3 = fig.add_subplot(gs[0, 2])
	    episode_lengths = [
	        df.filter(pl.col('episode_no') == episode)['dialogue_length'].to_list()
	        for episode in key_episodes
	    ]
	    ax3.boxplot(episode_lengths)
	    # Tick labels are set separately: boxplot's ``labels`` keyword is
	    # deprecated in newer Matplotlib, while this form works on old and new.
	    ax3.set_xticklabels([f'Ep {episode}\n{tag}' for episode, tag in key_episodes.items()])
	    ax3.set_title('Dialogue Length Distribution', fontweight='bold')
	    ax3.set_ylabel('Characters per Line')

	    # Plot 4: Rick's longest monologue
	    ax4 = fig.add_subplot(gs[1, :])
	    ax4.axis('off')

	    rick_longest = (
	        df.filter((pl.col('episode_no') == 30) & (pl.col('character') == 'Rick'))
	        .sort('dialogue_length', descending=True)
	        .head(1)
	    )
	    if rick_longest.height:
	        longest_line = rick_longest['cleaned_dialogue'][0]
	        longest_length = rick_longest['dialogue_length'][0]
	        # Only mark the excerpt as cut when something was actually cut.
	        excerpt = longest_line[:300] + ("..." if len(longest_line) > 300 else "")
	        monologue_text = (
	            f"RICK'S EPIC MONOLOGUE (Episode 30 - {longest_length} characters):\n\n"
	        )
	        monologue_text += textwrap.fill(excerpt, width=80)
	    else:
	        # Indexing an empty frame would raise; show a placeholder instead.
	        monologue_text = "No dialogue by Rick found in Episode 30."

	    ax4.text(0.02, 0.98, monologue_text, transform=ax4.transAxes, fontsize=10,
	             verticalalignment='top', fontfamily='monospace',
	             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()
	    return plot_to_base64(fig)

	def create_character_analysis(character_name):
	    """Build the per-character statistics summary and four-panel figure.

	    Panels: lines per episode, dialogue-length histogram with the mean
	    marked, a pie of exclamation/question/neutral lines, and a word cloud.

	    Args:
	        character_name: value matched exactly against the ``character``
	            column.

	    Returns:
	        tuple[str, str]: ``(summary_text, image_data_uri)``. When no row
	        matches, returns ``("Character not found in dataset.", "")``.
	    """
	    character_data = df.filter(pl.col('character') == character_name)

	    if character_data.height == 0:
	        return "Character not found in dataset.", ""

	    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

	    # Basic stats
	    total_lines = character_data.height
	    avg_length = character_data['dialogue_length'].mean()
	    total_chars = character_data['dialogue_length'].sum()
	    exclamation_count = character_data['has_exclamation'].sum()
	    question_count = character_data['has_question'].sum()
	    exclamation_rate = (exclamation_count / total_lines) * 100
	    question_rate = (question_count / total_lines) * 100

	    # Plot 1: Episode appearance
	    episode_appearances = (
	        character_data.group_by('episode_no')
	        .agg(pl.len().alias('lines'))
	        .sort('episode_no')
	    )
	    ax1.bar(episode_appearances['episode_no'].to_list(), episode_appearances['lines'].to_list())
	    ax1.set_title(f'{character_name} - Lines per Episode', fontweight='bold')
	    ax1.set_xlabel('Episode Number')
	    ax1.set_ylabel('Lines')

	    # Plot 2: Dialogue length distribution
	    ax2.hist(character_data['dialogue_length'].to_list(), bins=20, alpha=0.7, edgecolor='black')
	    ax2.set_title(f'{character_name} - Dialogue Length Distribution', fontweight='bold')
	    ax2.set_xlabel('Characters per Line')
	    ax2.set_ylabel('Frequency')
	    ax2.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.1f} chars')
	    ax2.legend()

	    # Plot 3: Emotional expression.
	    # A line can contain both "!" and "?", so the two rates may add up to more
	    # than 100% and "100 - both rates" can go negative, which pie() rejects.
	    # Split lines into disjoint groups instead; with no overlapping lines the
	    # three original slices keep exactly the same proportions.
	    both_count = character_data.filter(
	        pl.col('has_exclamation') & pl.col('has_question')
	    ).height
	    exclamation_only = exclamation_count - both_count
	    question_only = question_count - both_count
	    neutral_count = total_lines - exclamation_only - question_only - both_count
	    slice_sizes = [exclamation_only, question_only, neutral_count]
	    slice_labels = ['Exclamations', 'Questions', 'Neutral']
	    if both_count:
	        slice_sizes.append(both_count)
	        slice_labels.append('Both')
	    ax3.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%', startangle=90)
	    ax3.set_title(f'{character_name} - Emotional Expression', fontweight='bold')

	    # Plot 4: Word cloud
	    ax4.axis('off')
	    all_text = ' '.join(character_data['cleaned_dialogue'].to_list())
	    try:
	        wordcloud = WordCloud(width=400, height=200, background_color='white').generate(all_text)
	    except ValueError:
	        # WordCloud raises ValueError when no word is left to draw (for
	        # example a character whose only line consists of stop words).
	        ax4.text(0.5, 0.5, 'Not enough words for a word cloud',
	                 ha='center', va='center', transform=ax4.transAxes)
	    else:
	        ax4.imshow(wordcloud, interpolation='bilinear')
	        ax4.set_title(f'{character_name} - Common Words', fontweight='bold')

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()

	    # Longest line, shortened to 200 characters; the ellipsis is added only
	    # when the text was actually cut.
	    longest_line = character_data.sort('dialogue_length', descending=True)['cleaned_dialogue'][0]
	    excerpt = longest_line[:200] + ('...' if len(longest_line) > 200 else '')
	    longest_excerpt = textwrap.fill(excerpt, width=60)

	    # Character summary
	    summary = f"""
	{character_name} Character Analysis:

	• Total Lines: {total_lines}
	• Average Line Length: {avg_length:.1f} characters
	• Total Characters Spoken: {total_chars:,}
	• Exclamation Rate: {exclamation_rate:.1f}%
	• Question Rate: {question_rate:.1f}%
	• Episodes Appeared: {character_data['episode_no'].n_unique()}

	Longest Dialogue:
	{longest_excerpt}
	"""

	    return summary, plot_to_base64(fig)

	def create_episode_25_analysis():
	    """Build the Episode 25 anomaly write-up and its two-panel figure.

	    The left panel plots fixed counts of stage-direction versus dialogue
	    lines (constants in this function, not derived from the data). The right
	    panel compares the mean dialogue length of episodes 24, 25 and 26.

	    Returns:
	        tuple[str, str]: ``(analysis_text, image_data_uri)``.
	    """
	    ep25 = df.filter(pl.col('episode_no') == 25)

	    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

	    # Plot 1: Episode 25 content breakdown
	    content_types = ['Stage Directions', 'Actual Dialogue']
	    counts = [15, 1]  # From our analysis

	    ax1.bar(content_types, counts, color=['#FF6B6B', '#4ECDC4'])
	    ax1.set_title('Episode 25: Content Type Breakdown\n(Data Anomaly)', fontweight='bold')
	    ax1.set_ylabel('Number of Lines')
	    for i, count in enumerate(counts):
	        ax1.text(i, count + 0.1, str(count), ha='center', va='bottom', fontweight='bold')

	    # Plot 2: Comparison with normal episodes
	    ep24 = df.filter(pl.col('episode_no') == 24)
	    ep26 = df.filter(pl.col('episode_no') == 26)

	    episode_means = (
	        ep24['dialogue_length'].mean(),
	        ep25['dialogue_length'].mean(),
	        ep26['dialogue_length'].mean(),
	    )
	    # mean() yields None for an episode with no rows, which bar() cannot
	    # plot; draw such an episode as a zero-height bar instead of failing.
	    comparison_data = [0.0 if mean is None else mean for mean in episode_means]

	    ax2.bar(['Episode 24', 'Episode 25\n(Anomaly)', 'Episode 26'], comparison_data,
	            color=['#45B7D1', '#FF6B6B', '#4ECDC4'])
	    ax2.set_title('Average Dialogue Length Comparison', fontweight='bold')
	    ax2.set_ylabel('Average Characters per Line')

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()

	    analysis_text = f"""
	Episode 25 Anomaly Discovery:

	🚨 Critical Finding: Episode 25 is not a normal dialogue episode!

	• Total Lines: {ep25.height}
	• Stage Directions: 15 lines (93.8%)
	• Actual Character Dialogue: 1 line (6.2%)
	• Emotional Markers: 0 exclamations, 0 questions

	Explanation: This episode consists primarily of narrative stage directions
	and scene descriptions rather than character dialogue, explaining the complete
	absence of emotional expression markers.

	Impact: Episode 25 should be excluded from character and emotional analysis
	as it represents a different data format (montage/recap episode).
	"""

	    return analysis_text, plot_to_base64(fig)

	def create_word_analysis():
	    """Plot word frequencies and per-episode emotional markers.

	    The left panel shows the 15 most frequent words after removing a short
	    list of common words; the right panel shows, per episode, the share of
	    lines containing an exclamation mark or a question mark.

	    Returns:
	        tuple[str, str]: ``(analysis_text, image_data_uri)``.
	    """
	    figure, (freq_ax, trend_ax) = plt.subplots(1, 2, figsize=(15, 6))

	    # Word frequency analysis
	    common_words = ['the', 'and', 'to', 'a', 'i', 'you', 'it', 'that', 'is', 'this', 'of', 'in', 'for']
	    corpus = ' '.join(df['cleaned_dialogue'].to_list())
	    tokens = re.findall(r'\b\w+\b', corpus.lower())
	    token_counts = pl.DataFrame({'word': tokens}).group_by('word').agg(
	        pl.len().alias('frequency')
	    )
	    top_words = token_counts.filter(~pl.col('word').is_in(common_words))
	    top_words = top_words.sort('frequency', descending=True).head(15)

	    freq_ax.barh(top_words['word'].to_list(), top_words['frequency'].to_list())
	    freq_ax.set_title('Top 15 Most Frequent Words\n(Excluding Common Words)', fontweight='bold')
	    freq_ax.set_xlabel('Frequency')

	    # Emotional content over time
	    per_episode = df.group_by('episode_no').agg([
	        (pl.col('has_exclamation').sum() / pl.len() * 100).alias('exclamation_pct'),
	        (pl.col('has_question').sum() / pl.len() * 100).alias('question_pct'),
	    ]).sort('episode_no')

	    # One line per marker, drawn in the same order as the legend entries.
	    series_to_draw = (
	        ('exclamation_pct', 'Exclamations'),
	        ('question_pct', 'Questions'),
	    )
	    for column, legend_label in series_to_draw:
	        trend_ax.plot(per_episode['episode_no'].to_list(),
	                      per_episode[column].to_list(),
	                      'o-', label=legend_label, linewidth=2)
	    trend_ax.set_title('Emotional Expression Over Time', fontweight='bold')
	    trend_ax.set_xlabel('Episode Number')
	    trend_ax.set_ylabel('Percentage of Lines (%)')
	    trend_ax.legend()
	    trend_ax.grid(True, alpha=0.3)

	    plt.tight_layout()

	    analysis_text = """
	Linguistic Analysis Insights:

	• Common Vocabulary: Analysis reveals the most frequently used words beyond common articles
	• Emotional Trends: Tracking how emotional expression (exclamations/questions) varies across episodes
	• Narrative Patterns: Identifying recurring linguistic themes and character speech patterns

	The word frequency analysis helps understand the core vocabulary of the series,
	while emotional tracking shows how the tone evolves throughout different episodes.
	"""

	    encoded_figure = plot_to_base64(figure)
	    return analysis_text, encoded_figure

	# Gradio Interface
	def _img_tag(img_data):
	    """Wrap an image data URI in the responsive <img> element shown in gr.HTML."""
	    return f'<img src="{img_data}" style="max-width:100%; height:auto;">'


	# Static copy for the tabs, kept apart from the layout code below.
	_FEATURED_EPISODES_MD = """
	Featured Episodes Analysis:
	- Episode 30: Most talkative (859 lines)
	- Episode 12: Character-rich (96 unique characters!)
	- Episode 6: Long dialogues (90.2 avg length)
	- Episode 7: Short dialogues (33.4 avg length)
	"""

	_KEY_DISCOVERIES_MD = """
	### 🎯 Key Discoveries from Our Analysis:

	1. Character Dominance Patterns:
	- Rick dominates with 28.7% of all dialogue
	- Morty follows with 20.1% but shows more emotional expression
	- Top 5 characters account for 73.9% of total lines

	2. Episode Structure Extremes:
	- Episode 30: 859 lines (3.5x series average)
	- Episode 12: 96 unique characters (4.8x series average)
	- Episode 6: 90.2 avg characters per line (1.4x average)
	- Episode 7: 33.4 avg characters per line (0.5x average)

	3. Surprising Character Dynamics:
	- Testicle Monster A has 19 lines in Episode 12 (2nd most!)
	- 53 alternate reality Ricks/Mortys appear in Episode 12
	- 47 characters in Episode 12 have only 1 line

	4. Data Quality Insights:
	- Episode 25 is an anomaly (93.8% stage directions)
	- Complete absence of emotional markers in Episode 25
	- Demonstrates importance of data preprocessing

	5. Storytelling Innovation:
	- 2.7x range in dialogue pacing across episodes
	- Willingness to experiment with extreme narrative structures
	- Balanced character consistency with creative risk-taking
	"""

	# Statistics are interpolated from the module-level ``df`` when this runs.
	_DATASET_INFO_MD = f"""
	### Hugging Face Dataset: Prarabdha/Rick_and_Morty_Transcript

	Dataset Statistics:
	- Total Episodes: {df['episode_no'].n_unique()}
	- Total Lines: {df.height:,}
	- Unique Characters: {df['character'].n_unique()}
	- Total Dialogue Characters: {df['dialogue_length'].sum():,}
	- Average Line Length: {df['dialogue_length'].mean():.1f} characters

	Data Collection:
	- Source: Rick and Morty animated series transcripts
	- Format: CSV with episode numbers, character names, and dialogue
	- Coverage: Multiple seasons of the series

	Analysis Methodology:
	- Data cleaning and preprocessing with Python Polars
	- Statistical analysis of character and episode patterns
	- Visualization of storytelling structures and trends
	- Identification of data anomalies and quality issues

	Technical Stack:
	- Python Polars for fast data processing
	- Matplotlib & Seaborn for visualizations
	- Gradio for interactive web interface
	- Hugging Face Datasets for data access
	"""

	# Each tab owns its widgets and the handler wired to its button.
	with gr.Blocks(theme=gr.themes.Soft(), title="Rick and Morty Transcript Analysis") as demo:
	    gr.Markdown("# 🎬 Rick and Morty Transcript Analysis")
	    gr.Markdown("### Comprehensive analysis of the Hugging Face Dataset: Prarabdha/Rick_and_Morty_Transcript")
	    gr.Markdown("Explore character dynamics, episode structures, and storytelling patterns across the entire series!")

	    with gr.Tab("📊 Overview Dashboard"):
	        gr.Markdown("## Dataset Overview and Key Metrics")
	        overview_btn = gr.Button("Generate Overview Dashboard")
	        overview_output = gr.HTML()

	        def update_overview():
	            """Render the overview dashboard as an <img> tag."""
	            return _img_tag(create_overview_dashboard())

	        overview_btn.click(update_overview, inputs=[], outputs=[overview_output])

	    with gr.Tab("🎭 Episode Insights"):
	        gr.Markdown("## Deep Dive into Key Episodes")
	        gr.Markdown(_FEATURED_EPISODES_MD)
	        insights_btn = gr.Button("Generate Episode Insights")
	        insights_output = gr.HTML()

	        def update_insights():
	            """Render the key-episode figure as an <img> tag."""
	            return _img_tag(create_episode_insights())

	        insights_btn.click(update_insights, inputs=[], outputs=[insights_output])

	    with gr.Tab("🔍 Character Analysis"):
	        gr.Markdown("## Detailed Character Analysis")
	        character_input = gr.Dropdown(
	            choices=df['character'].unique().sort().to_list(),
	            label="Select Character",
	            value="Rick"
	        )
	        character_btn = gr.Button("Analyze Character")
	        character_summary = gr.Markdown()
	        character_viz = gr.HTML()

	        def update_character(character_name):
	            """Return the character summary and, when one exists, its figure."""
	            summary, img_data = create_character_analysis(character_name)
	            # An empty data URI signals "character not found": show no image.
	            return summary, (_img_tag(img_data) if img_data else "")

	        character_btn.click(
	            update_character,
	            inputs=[character_input],
	            outputs=[character_summary, character_viz],
	        )

	    with gr.Tab("🚨 Episode 25 Anomaly"):
	        gr.Markdown("## Episode 25 Data Anomaly Discovery")
	        anomaly_btn = gr.Button("Analyze Episode 25 Anomaly")
	        anomaly_summary = gr.Markdown()
	        anomaly_viz = gr.HTML()

	        def update_anomaly():
	            """Return the Episode 25 write-up and its figure."""
	            summary, img_data = create_episode_25_analysis()
	            return summary, _img_tag(img_data)

	        anomaly_btn.click(update_anomaly, inputs=[], outputs=[anomaly_summary, anomaly_viz])

	    with gr.Tab("📝 Word Analysis"):
	        gr.Markdown("## Linguistic and Emotional Analysis")
	        word_btn = gr.Button("Generate Word Analysis")
	        word_summary = gr.Markdown()
	        word_viz = gr.HTML()

	        def update_word_analysis():
	            """Return the linguistic write-up and its figure."""
	            summary, img_data = create_word_analysis()
	            return summary, _img_tag(img_data)

	        word_btn.click(update_word_analysis, inputs=[], outputs=[word_summary, word_viz])

	    with gr.Tab("📈 Key Discoveries"):
	        gr.Markdown("## Major Research Findings")
	        gr.Markdown(_KEY_DISCOVERIES_MD)

	    with gr.Tab("📋 Dataset Info"):
	        gr.Markdown("## Dataset Information")
	        gr.Markdown(_DATASET_INFO_MD)

	# Start the server only when run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()