# Source: Hugging Face Space TroglodyteDerivations/Rick_and_Morty_Transcript_Analysis
# (Space status at capture: Sleeping; file size: 20,707 bytes; revision e3e7844)

import gradio as gr
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import textwrap
import io
import re 
import base64

# Set up styling
# Every figure in this app is rendered straight to an in-memory PNG and never
# shown in a window, and the rendering happens inside Gradio event handlers
# (which Gradio may run on worker threads). Force the non-interactive Agg
# backend so no GUI toolkit is ever touched off the main thread.
plt.switch_backend('Agg')
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Load and prepare data
def load_data():
    """Load the transcript CSV and derive the per-line features used by the app.

    Reads ``Rick-n-Morty.csv`` from the current working directory, renames the
    raw columns (the unnamed first column, ``episode no.``, ``speaker`` and the
    misspelled ``dialouge``) and adds these columns:

    * ``cleaned_dialogue`` - the dialogue with every character other than word
      characters, whitespace and ``. ! ? ,`` removed, whitespace runs collapsed
      to a single space and the ends stripped;
    * ``dialogue_length`` - character count of ``cleaned_dialogue``;
    * ``has_exclamation`` / ``has_question`` - whether the cleaned line
      contains ``!`` / ``?``;
    * ``word_count`` - number of space-separated tokens in the cleaned line.

    Rows whose cleaned dialogue is empty are dropped.

    Returns:
        pl.DataFrame: the cleaned, feature-enriched transcript.
    """
    df = pl.read_csv('Rick-n-Morty.csv').rename({
        '': 'line_id', 'episode no.': 'episode_no',
        'speaker': 'character', 'dialouge': 'dialogue'
    })

    # Compiled once here because clean_text is invoked for every row.
    disallowed_chars = re.compile(r'[^\w\s\.\!\?\,]')
    whitespace_run = re.compile(r'\s+')

    def clean_text(text):
        """Strip unsupported punctuation and normalise whitespace in one line."""
        if text is None:
            return ""
        text = disallowed_chars.sub('', str(text))
        return whitespace_run.sub(' ', text).strip()

    df = df.with_columns([
        pl.col('dialogue').map_elements(clean_text, return_dtype=pl.Utf8).alias('cleaned_dialogue')
    ]).filter(pl.col('cleaned_dialogue').str.len_chars() > 0)

    df = df.with_columns([
        pl.col('cleaned_dialogue').str.len_chars().alias('dialogue_length'),
        pl.col('cleaned_dialogue').str.contains(r'!+').alias('has_exclamation'),
        pl.col('cleaned_dialogue').str.contains(r'\?+').alias('has_question'),
        pl.col('cleaned_dialogue').str.split(' ').list.len().alias('word_count')
    ])

    return df

# Load the transcript once at import time; the analysis functions and the
# Gradio layout in this module all read this module-level frame.
df = load_data()

# Analysis functions
def plot_to_base64(fig):
    """Render a matplotlib figure to a PNG data URI and close the figure.

    Args:
        fig: The matplotlib figure to serialise. It is closed before this
            function returns or raises, so it must not be reused afterwards.

    Returns:
        str: A ``data:image/png;base64,...`` URI suitable for an ``<img src>``.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # pyplot keeps a reference to every open figure; close it even when
        # rendering fails so a long-running server does not accumulate them.
        plt.close(fig)
    img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{img_str}"

def create_overview_dashboard():
    """Render the four-panel dataset overview and return it as a PNG data URI.

    Panels: the ten characters with the most lines, total lines per episode,
    a histogram of cleaned dialogue length, and exclamation/question line
    counts for the eight most prolific characters with more than 50 lines.

    Returns:
        str: The ``data:image/png;base64,...`` URI from ``plot_to_base64``.
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: Character dominance
    top_chars = df.group_by('character').agg(pl.len().alias('lines')).sort('lines', descending=True).head(10)
    ax1.barh(top_chars['character'].to_list(), top_chars['lines'].to_list())
    ax1.set_title('Top 10 Characters by Lines', fontweight='bold')
    ax1.set_xlabel('Number of Lines')

    # Plot 2: Episode line distribution
    episode_lines = df.group_by('episode_no').agg(pl.len().alias('lines')).sort('episode_no')
    ax2.plot(episode_lines['episode_no'].to_list(), episode_lines['lines'].to_list(), 'o-')
    ax2.set_title('Lines per Episode', fontweight='bold')
    ax2.set_xlabel('Episode Number')
    ax2.set_ylabel('Total Lines')
    ax2.grid(True, alpha=0.3)

    # Plot 3: Dialogue length distribution
    ax3.hist(df['dialogue_length'].to_list(), bins=50, alpha=0.7, edgecolor='black')
    ax3.set_title('Dialogue Length Distribution', fontweight='bold')
    ax3.set_xlabel('Characters per Line')
    ax3.set_ylabel('Frequency')

    # Plot 4: Emotional content
    # group_by does not guarantee row order, so rank by line count before
    # taking the first eight; otherwise the "top characters" shown would be an
    # arbitrary selection that can change between runs.
    emotional_data = df.group_by('character').agg([
        pl.len().alias('total_lines'),
        pl.col('has_exclamation').sum().alias('exclamations'),
        pl.col('has_question').sum().alias('questions')
    ]).filter(pl.col('total_lines') > 50).sort('total_lines', descending=True).head(8)

    x = np.arange(len(emotional_data))
    width = 0.35
    ax4.bar(x - width/2, emotional_data['exclamations'].to_list(), width, label='Exclamations')
    ax4.bar(x + width/2, emotional_data['questions'].to_list(), width, label='Questions')
    ax4.set_title('Emotional Expression - Top Characters', fontweight='bold')
    ax4.set_xticks(x)
    ax4.set_xticklabels(emotional_data['character'].to_list(), rotation=45)
    ax4.legend()

    plt.tight_layout()
    return plot_to_base64(fig)

def create_episode_insights():
    """Render the key-episode comparison figure and return it as a PNG data URI.

    The top row holds three panels (headline metrics for episodes 6, 7, 12 and
    30; how many characters in episode 12 fall into each line-count bucket; box
    plots of dialogue length for the four episodes). The bottom row shows an
    excerpt of Rick's longest line in episode 30.

    Returns:
        str: The ``data:image/png;base64,...`` URI from ``plot_to_base64``.
    """
    fig = plt.figure(figsize=(16, 10))

    # Plot layout
    gs = fig.add_gridspec(2, 3)

    # Plot 1: Comparative metrics
    # NOTE(review): these figures are hard-coded rather than derived from df -
    # presumably taken from an earlier offline analysis; confirm they still
    # match the CSV before relying on them.
    ax1 = fig.add_subplot(gs[0, 0])
    metrics = ['Lines', 'Characters', 'Avg Length']
    ep6_vals = [74, 20, 90.2]
    ep7_vals = [170, 15, 33.4]
    ep12_vals = [338, 96, 93.6]
    ep30_vals = [859, 38, 75.3]

    x = np.arange(len(metrics))
    width = 0.2

    ax1.bar(x - width*1.5, ep6_vals, width, label='Ep 6: Monologue', alpha=0.8)
    ax1.bar(x - width*0.5, ep7_vals, width, label='Ep 7: Concise', alpha=0.8)
    ax1.bar(x + width*0.5, ep12_vals, width, label='Ep 12: Ensemble', alpha=0.8)
    ax1.bar(x + width*1.5, ep30_vals, width, label='Ep 30: Dense', alpha=0.8)

    ax1.set_title('Key Episode Comparison', fontweight='bold')
    ax1.set_xticks(x)
    ax1.set_xticklabels(metrics)
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)

    # Plot 2: Episode 12 character distribution
    ax2 = fig.add_subplot(gs[0, 1])
    ep12 = df.filter(pl.col('episode_no') == 12)
    char_dist = ep12.group_by('character').agg(pl.len().alias('lines'))
    line_ranges = ['1 line', '2-5 lines', '6-10 lines', '11+ lines']
    counts = [
        char_dist.filter(pl.col('lines') == 1).height,
        char_dist.filter((pl.col('lines') >= 2) & (pl.col('lines') <= 5)).height,
        char_dist.filter((pl.col('lines') >= 6) & (pl.col('lines') <= 10)).height,
        char_dist.filter(pl.col('lines') >= 11).height
    ]

    ax2.bar(line_ranges, counts, color=['#FF9999', '#FF6B6B', '#CC4455', '#990033'])
    ax2.set_title('Episode 12: Character Distribution\n(96 Unique Characters!)', fontweight='bold')
    ax2.set_ylabel('Number of Characters')

    for i, count in enumerate(counts):
        ax2.text(i, count + 0.5, str(count), ha='center', va='bottom', fontweight='bold')

    # Plot 3: Dialogue length comparison
    ax3 = fig.add_subplot(gs[0, 2])
    episode_lengths = [
        df.filter(pl.col('episode_no') == episode)['dialogue_length'].to_list()
        for episode in (6, 7, 12, 30)
    ]

    # boxplot's `labels` keyword is deprecated in Matplotlib 3.9 (renamed to
    # `tick_labels`); setting the tick labels on the axes works on both old
    # and new releases.
    ax3.boxplot(episode_lengths)
    ax3.set_xticklabels(['Ep 6\nMonologue', 'Ep 7\nConcise', 'Ep 12\nEnsemble', 'Ep 30\nDense'])
    ax3.set_title('Dialogue Length Distribution', fontweight='bold')
    ax3.set_ylabel('Characters per Line')

    # Plot 4: Rick's longest monologue
    ax4 = fig.add_subplot(gs[1, :])
    ax4.axis('off')

    rick_longest = df.filter(
        (pl.col('episode_no') == 30) & (pl.col('character') == 'Rick')
    ).sort('dialogue_length', descending=True).head(1)

    # Indexing row 0 of an empty frame raises, so fall back to a notice when
    # the data holds no Rick lines for episode 30.
    if rick_longest.height:
        excerpt = rick_longest['cleaned_dialogue'][0][:300] + "..."
    else:
        excerpt = "(No lines by Rick were found for Episode 30.)"

    monologue_text = "RICK'S EPIC MONOLOGUE (Episode 30 - 865 characters):\n\n"
    monologue_text += textwrap.fill(excerpt, width=80)

    ax4.text(0.02, 0.98, monologue_text, transform=ax4.transAxes, fontsize=10,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

    plt.tight_layout()
    return plot_to_base64(fig)

def create_character_analysis(character_name):
    """Build the Markdown summary and four-panel figure for one character.

    Args:
        character_name: Value to match exactly against the ``character`` column.

    Returns:
        tuple[str, str]: ``(summary_markdown, png_data_uri)``. When no row
        matches, returns ``("Character not found in dataset.", "")`` and no
        figure is created.
    """
    character_data = df.filter(pl.col('character') == character_name)

    if character_data.height == 0:
        return "Character not found in dataset.", ""

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # Basic stats
    total_lines = character_data.height
    avg_length = character_data['dialogue_length'].mean()
    total_chars = character_data['dialogue_length'].sum()
    exclamation_lines = character_data['has_exclamation'].sum()
    question_lines = character_data['has_question'].sum()
    exclamation_rate = (exclamation_lines / total_lines) * 100
    question_rate = (question_lines / total_lines) * 100

    # Plot 1: Episode appearance
    episode_appearances = character_data.group_by('episode_no').agg(pl.len().alias('lines')).sort('episode_no')
    ax1.bar(episode_appearances['episode_no'].to_list(), episode_appearances['lines'].to_list())
    ax1.set_title(f'{character_name} - Lines per Episode', fontweight='bold')
    ax1.set_xlabel('Episode Number')
    ax1.set_ylabel('Lines')

    # Plot 2: Dialogue length distribution
    ax2.hist(character_data['dialogue_length'].to_list(), bins=20, alpha=0.7, edgecolor='black')
    ax2.set_title(f'{character_name} - Dialogue Length Distribution', fontweight='bold')
    ax2.set_xlabel('Characters per Line')
    ax2.set_ylabel('Frequency')
    ax2.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.1f} chars')
    ax2.legend()

    # Plot 3: Emotional expression
    # A line can contain both '!' and '?', so the two rates overlap and
    # "100 - exclamation_rate - question_rate" can go negative, which ax.pie
    # rejects. Split the lines into mutually exclusive buckets instead.
    both_lines = character_data.filter(pl.col('has_exclamation') & pl.col('has_question')).height
    buckets = [
        ('Exclamations', exclamation_lines - both_lines),
        ('Questions', question_lines - both_lines),
        ('Both', both_lines),
        ('Neutral', total_lines - exclamation_lines - question_lines + both_lines),
    ]
    # Zero-sized wedges only add overlapping labels, so leave them out.
    buckets = [(label, count) for label, count in buckets if count > 0]
    ax3.pie([count for _, count in buckets], labels=[label for label, _ in buckets],
            autopct='%1.1f%%', startangle=90)
    ax3.set_title(f'{character_name} - Emotional Expression', fontweight='bold')

    # Plot 4: Word cloud
    ax4.axis('off')
    all_text = ' '.join(character_data['cleaned_dialogue'].to_list())
    if all_text.strip():
        try:
            wordcloud = WordCloud(width=400, height=200, background_color='white').generate(all_text)
        except ValueError:
            # WordCloud raises ValueError when no words are left to draw
            # (for example when every token is one of its stop words).
            ax4.text(0.5, 0.5, 'Not enough words for a word cloud',
                     ha='center', va='center', transform=ax4.transAxes)
        else:
            ax4.imshow(wordcloud, interpolation='bilinear')
            ax4.set_title(f'{character_name} - Common Words', fontweight='bold')

    plt.tight_layout()

    # Character summary
    longest = character_data.sort('dialogue_length', descending=True)['cleaned_dialogue'][0]
    # Only mark the excerpt as truncated when text was actually cut off.
    excerpt = longest[:200] + ('...' if len(longest) > 200 else '')

    # Assemble the Markdown without leading indentation. The wrapped excerpt
    # spans several unindented lines, which stops gr.Markdown's dedent pass
    # (inspect.cleandoc) from stripping an indented template, and the leftover
    # four-space indent would make the whole summary render as a code block.
    summary = "\n".join([
        f"**{character_name} Character Analysis:**",
        "",
        f"• **Total Lines**: {total_lines}",
        f"• **Average Line Length**: {avg_length:.1f} characters",
        f"• **Total Characters Spoken**: {total_chars:,}",
        f"• **Exclamation Rate**: {exclamation_rate:.1f}%",
        f"• **Question Rate**: {question_rate:.1f}%",
        f"• **Episodes Appeared**: {character_data['episode_no'].n_unique()}",
        "",
        "**Longest Dialogue**: ",
        textwrap.fill(excerpt, width=60),
    ])

    return summary, plot_to_base64(fig)

def create_episode_25_analysis():
    """Build the Episode 25 anomaly write-up and its two-panel figure.

    The left panel shows a hard-coded stage-direction vs. dialogue line count;
    the right panel compares the mean cleaned dialogue length of episodes 24,
    25 and 26 as computed from the loaded data.

    Returns:
        tuple[str, str]: ``(analysis_markdown, png_data_uri)``.
    """
    def mean_length(episode_frame):
        """Mean dialogue length, or 0.0 when the episode has no rows."""
        # Series.mean() yields None for an empty series, which ax.bar cannot
        # plot; show a zero-height bar instead of failing.
        average = episode_frame['dialogue_length'].mean()
        return 0.0 if average is None else average

    ep25 = df.filter(pl.col('episode_no') == 25)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Episode 25 content breakdown
    content_types = ['Stage Directions', 'Actual Dialogue']
    # NOTE(review): fixed values, not computed from df; the percentages in the
    # text below assume the same 15/1 split.
    counts = [15, 1]  # From our analysis

    ax1.bar(content_types, counts, color=['#FF6B6B', '#4ECDC4'])
    ax1.set_title('Episode 25: Content Type Breakdown\n(Data Anomaly)', fontweight='bold')
    ax1.set_ylabel('Number of Lines')
    for i, count in enumerate(counts):
        ax1.text(i, count + 0.1, str(count), ha='center', va='bottom', fontweight='bold')

    # Plot 2: Comparison with normal episodes
    ep24 = df.filter(pl.col('episode_no') == 24)
    ep26 = df.filter(pl.col('episode_no') == 26)

    comparison_data = [mean_length(ep24), mean_length(ep25), mean_length(ep26)]

    ax2.bar(['Episode 24', 'Episode 25\n(Anomaly)', 'Episode 26'], comparison_data,
            color=['#45B7D1', '#FF6B6B', '#4ECDC4'])
    ax2.set_title('Average Dialogue Length Comparison', fontweight='bold')
    ax2.set_ylabel('Average Characters per Line')

    plt.tight_layout()

    analysis_text = f"""
    **Episode 25 Anomaly Discovery:**
    
    🚨 **Critical Finding**: Episode 25 is not a normal dialogue episode!
    
    • **Total Lines**: {ep25.height}
    • **Stage Directions**: 15 lines (93.8%)
    • **Actual Character Dialogue**: 1 line (6.2%)
    • **Emotional Markers**: 0 exclamations, 0 questions
    
    **Explanation**: This episode consists primarily of narrative stage directions
    and scene descriptions rather than character dialogue, explaining the complete
    absence of emotional expression markers.
    
    **Impact**: Episode 25 should be excluded from character and emotional analysis
    as it represents a different data format (montage/recap episode).
    """

    return analysis_text, plot_to_base64(fig)

def create_word_analysis():
    """Plot the most frequent words and per-episode emotional markers.

    Left panel: the 15 most frequent lowercase word tokens across all cleaned
    dialogue, after removing a short list of common words. Right panel: the
    percentage of lines per episode that contain an exclamation or a question.

    Returns:
        tuple[str, str]: ``(analysis_markdown, png_data_uri)``.
    """
    fig, (freq_ax, trend_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # --- Word frequencies -------------------------------------------------
    common_words = ['the', 'and', 'to', 'a', 'i', 'you', 'it', 'that', 'is', 'this', 'of', 'in', 'for']
    corpus = ' '.join(df['cleaned_dialogue'].to_list())
    tokens = re.findall(r'\b\w+\b', corpus.lower())

    token_counts = pl.DataFrame({'word': tokens}).group_by('word').agg(
        pl.len().alias('frequency')
    )
    top_words = (
        token_counts
        .filter(~pl.col('word').is_in(common_words))
        .sort('frequency', descending=True)
        .head(15)
    )

    freq_ax.barh(top_words['word'].to_list(), top_words['frequency'].to_list())
    freq_ax.set_title('Top 15 Most Frequent Words\n(Excluding Common Words)', fontweight='bold')
    freq_ax.set_xlabel('Frequency')

    # --- Emotional markers per episode -------------------------------------
    exclamation_share = (pl.col('has_exclamation').sum() / pl.len() * 100).alias('exclamation_pct')
    question_share = (pl.col('has_question').sum() / pl.len() * 100).alias('question_pct')
    per_episode = df.group_by('episode_no').agg([exclamation_share, question_share]).sort('episode_no')

    # Exclamations are drawn first, then questions, each against the episode axis.
    for column, series_label in (('exclamation_pct', 'Exclamations'), ('question_pct', 'Questions')):
        trend_ax.plot(per_episode['episode_no'].to_list(),
                      per_episode[column].to_list(),
                      'o-', label=series_label, linewidth=2)

    trend_ax.set_title('Emotional Expression Over Time', fontweight='bold')
    trend_ax.set_xlabel('Episode Number')
    trend_ax.set_ylabel('Percentage of Lines (%)')
    trend_ax.legend()
    trend_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    analysis_text = """
    **Linguistic Analysis Insights:**
    
    • **Common Vocabulary**: Analysis reveals the most frequently used words beyond common articles
    • **Emotional Trends**: Tracking how emotional expression (exclamations/questions) varies across episodes
    • **Narrative Patterns**: Identifying recurring linguistic themes and character speech patterns
    
    The word frequency analysis helps understand the core vocabulary of the series,
    while emotional tracking shows how the tone evolves throughout different episodes.
    """

    return analysis_text, plot_to_base64(fig)

# Gradio Interface
def _image_html(img_data):
    """Wrap a PNG data URI in a responsive <img> tag; empty input gives ""."""
    if not img_data:
        return ""
    return f'<img src="{img_data}" style="max-width:100%; height:auto;">'


# Gradio Interface: one tab per analysis; each button renders its figure on
# demand and shows it through an HTML component as an inline PNG.
with gr.Blocks(theme=gr.themes.Soft(), title="Rick and Morty Transcript Analysis") as demo:
    gr.Markdown("# 🎬 Rick and Morty Transcript Analysis")
    gr.Markdown("### Comprehensive analysis of the Hugging Face Dataset: Prarabdha/Rick_and_Morty_Transcript")
    gr.Markdown("Explore character dynamics, episode structures, and storytelling patterns across the entire series!")
    
    with gr.Tab("📊 Overview Dashboard"):
        gr.Markdown("## Dataset Overview and Key Metrics")
        overview_btn = gr.Button("Generate Overview Dashboard")
        overview_output = gr.HTML()
        
        @overview_btn.click(inputs=[], outputs=[overview_output])
        def update_overview():
            return _image_html(create_overview_dashboard())
    
    with gr.Tab("🎭 Episode Insights"):
        gr.Markdown("## Deep Dive into Key Episodes")
        gr.Markdown("""
        **Featured Episodes Analysis:**
        - **Episode 30**: Most talkative (859 lines)
        - **Episode 12**: Character-rich (96 unique characters!)
        - **Episode 6**: Long dialogues (90.2 avg length)  
        - **Episode 7**: Short dialogues (33.4 avg length)
        """)
        insights_btn = gr.Button("Generate Episode Insights")
        insights_output = gr.HTML()
        
        @insights_btn.click(inputs=[], outputs=[insights_output])
        def update_insights():
            return _image_html(create_episode_insights())
    
    with gr.Tab("🔍 Character Analysis"):
        gr.Markdown("## Detailed Character Analysis")
        # Rows without a speaker would otherwise add a null entry to the list.
        character_choices = df['character'].drop_nulls().unique().sort().to_list()
        # Preselect Rick when present; otherwise fall back to the first name so
        # the dropdown never starts on a value outside its own choices.
        if "Rick" in character_choices:
            default_character = "Rick"
        else:
            default_character = character_choices[0] if character_choices else None
        character_input = gr.Dropdown(
            choices=character_choices,
            label="Select Character",
            value=default_character
        )
        character_btn = gr.Button("Analyze Character")
        character_summary = gr.Markdown()
        character_viz = gr.HTML()
        
        @character_btn.click(inputs=[character_input], outputs=[character_summary, character_viz])
        def update_character(character_name):
            # img_data is "" when the character is unknown; the helper then
            # returns "" so no broken image is shown.
            summary, img_data = create_character_analysis(character_name)
            return summary, _image_html(img_data)
    
    with gr.Tab("🚨 Episode 25 Anomaly"):
        gr.Markdown("## Episode 25 Data Anomaly Discovery")
        anomaly_btn = gr.Button("Analyze Episode 25 Anomaly")
        anomaly_summary = gr.Markdown()
        anomaly_viz = gr.HTML()
        
        @anomaly_btn.click(inputs=[], outputs=[anomaly_summary, anomaly_viz])
        def update_anomaly():
            summary, img_data = create_episode_25_analysis()
            return summary, _image_html(img_data)
    
    with gr.Tab("📝 Word Analysis"):
        gr.Markdown("## Linguistic and Emotional Analysis")
        word_btn = gr.Button("Generate Word Analysis")
        word_summary = gr.Markdown()
        word_viz = gr.HTML()
        
        @word_btn.click(inputs=[], outputs=[word_summary, word_viz])
        def update_word_analysis():
            summary, img_data = create_word_analysis()
            return summary, _image_html(img_data)
    
    with gr.Tab("📈 Key Discoveries"):
        gr.Markdown("## Major Research Findings")
        gr.Markdown("""
        ### 🎯 Key Discoveries from Our Analysis:
        
        **1. Character Dominance Patterns:**
        - Rick dominates with 28.7% of all dialogue
        - Morty follows with 20.1% but shows more emotional expression
        - Top 5 characters account for 73.9% of total lines
        
        **2. Episode Structure Extremes:**
        - **Episode 30**: 859 lines (3.5x series average)
        - **Episode 12**: 96 unique characters (4.8x series average)  
        - **Episode 6**: 90.2 avg characters per line (1.4x average)
        - **Episode 7**: 33.4 avg characters per line (0.5x average)
        
        **3. Surprising Character Dynamics:**
        - Testicle Monster A has 19 lines in Episode 12 (2nd most!)
        - 53 alternate reality Ricks/Mortys appear in Episode 12
        - 47 characters in Episode 12 have only 1 line
        
        **4. Data Quality Insights:**
        - Episode 25 is an anomaly (93.8% stage directions)
        - Complete absence of emotional markers in Episode 25
        - Demonstrates importance of data preprocessing
        
        **5. Storytelling Innovation:**
        - 2.7x range in dialogue pacing across episodes
        - Willingness to experiment with extreme narrative structures
        - Balanced character consistency with creative risk-taking
        """)
    
    with gr.Tab("📋 Dataset Info"):
        gr.Markdown("## Dataset Information")
        gr.Markdown(f"""
        ### Hugging Face Dataset: Prarabdha/Rick_and_Morty_Transcript
        
        **Dataset Statistics:**
        - **Total Episodes**: {df['episode_no'].n_unique()}
        - **Total Lines**: {df.height:,}
        - **Unique Characters**: {df['character'].n_unique()}
        - **Total Dialogue Characters**: {df['dialogue_length'].sum():,}
        - **Average Line Length**: {df['dialogue_length'].mean():.1f} characters
        
        **Data Collection:**
        - Source: Rick and Morty animated series transcripts
        - Format: CSV with episode numbers, character names, and dialogue
        - Coverage: Multiple seasons of the series
        
        **Analysis Methodology:**
        - Data cleaning and preprocessing with Python Polars
        - Statistical analysis of character and episode patterns
        - Visualization of storytelling structures and trends
        - Identification of data anomalies and quality issues
        
        **Technical Stack:**
        - Python Polars for fast data processing
        - Matplotlib & Seaborn for visualizations
        - Gradio for interactive web interface
        - Hugging Face Datasets for data access
        """)

if __name__ == "__main__":
    demo.launch()