{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2745946d",
   "metadata": {},
   "source": [
    "# Multi-Talker Pipeline: Results Visualization & Comparison\n",
    "\n",
    "This notebook visualizes benchmark results from comparing three audio source separation approaches:\n",
    "- **ICA**: Simple, fast Independent Component Analysis\n",
    "- **Frankenstein**: ICA + English language bias for talker selection\n",
    "- **ICA+DeepLearning**: Two-pass (spatial + temporal) separation with SepFormer\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "318a1d0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import json\n",
    "from pathlib import Path\n",
    "from datetime import datetime\n",
    "\n",
    "# Set style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)\n",
    "plt.rcParams['font.size'] = 10\n",
    "\n",
    "print(\"Imports successful!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b191c6b1",
   "metadata": {},
   "source": [
    "## Load Benchmark Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec482d67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to benchmark results\n",
    "RESULTS_DIR = Path('../benchmark_results')\n",
    "CSV_FILE = RESULTS_DIR / 'benchmark_results.csv'\n",
    "JSON_FILE = RESULTS_DIR / 'benchmark_results.json'\n",
    "\n",
    "# Fail fast: every later cell reads `df`, so a silently skipped load would\n",
    "# only surface further down as an unrelated-looking NameError.\n",
    "if not CSV_FILE.exists():\n",
    "    raise FileNotFoundError(\n",
    "        f'Benchmark results not found: {CSV_FILE.resolve()}. '\n",
    "        'Run the benchmark first or point RESULTS_DIR at its output.'\n",
    "    )\n",
    "\n",
    "# Load CSV\n",
    "df = pd.read_csv(CSV_FILE)\n",
    "print(f'Loaded {len(df)} results from {CSV_FILE}')\n",
    "print(f'\\nColumns: {list(df.columns)}')\n",
    "print(f'\\nDataframe shape: {df.shape}')\n",
    "\n",
    "# Jupyter only renders the last top-level expression of a cell, so the\n",
    "# preview has to live outside any `if` block to be displayed.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9fe2465a",
   "metadata": {},
   "source": [
    "## 1. 
Execution Time Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ac852c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter only successful runs. The .copy() gives later cells an independent\n",
    "# frame to add columns to without touching `df`.\n",
    "df_success = df[df['status'] == 'SUCCESS'].copy()\n",
    "\n",
    "if len(df_success) > 0:\n",
    "    # Execution time by approach\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "    # Bar chart: mean execution time per approach\n",
    "    exec_times = df_success.groupby('approach')['execution_time_seconds'].mean()\n",
    "    exec_times.plot(kind='bar', ax=axes[0], color=['#1f77b4', '#ff7f0e', '#2ca02c'])\n",
    "    axes[0].set_title('Average Execution Time', fontsize=12, fontweight='bold')\n",
    "    axes[0].set_ylabel('Time (seconds)')\n",
    "    axes[0].set_xlabel('Approach')\n",
    "    axes[0].tick_params(axis='x', rotation=45)\n",
    "\n",
    "    # Add value labels on bars. The gap is given in points, not data units,\n",
    "    # so labels sit just above the bar tops whatever the time scale is.\n",
    "    for i, v in enumerate(exec_times):\n",
    "        axes[0].annotate(f'{v:.0f}s', xy=(i, v), xytext=(0, 3),\n",
    "                         textcoords='offset points',\n",
    "                         ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "    # Box plot (distribution)\n",
    "    df_success.boxplot(column='execution_time_seconds', by='approach', ax=axes[1])\n",
    "    axes[1].set_title('Execution Time Distribution', fontsize=12, fontweight='bold')\n",
    "    axes[1].set_ylabel('Time (seconds)')\n",
    "    axes[1].set_xlabel('Approach')\n",
    "    plt.suptitle('')  # Remove the default title added by DataFrame.boxplot(by=...)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Statistics\n",
    "    print('\\n=== EXECUTION TIME STATISTICS ===')\n",
    "    print(df_success.groupby('approach')['execution_time_seconds'].describe().round(2))\n",
    "else:\n",
    "    print('No successful runs to display')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7758f016",
   "metadata": {},
   "source": [
    "## 2. 
Speedup Metric (Realtime Factor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac1f2073",
   "metadata": {},
   "outputs": [],
   "source": [
    "if len(df_success) > 0:\n",
    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
    "\n",
    "    # Calculate speedup (audio_duration / execution_time). A zero execution\n",
    "    # time is mapped to NaN first: dividing by 0 would give inf, which would\n",
    "    # then dominate the per-approach mean and break the bar heights.\n",
    "    exec_seconds = df_success['execution_time_seconds'].replace(0, np.nan)\n",
    "    df_success['speedup'] = df_success['duration_seconds'] / exec_seconds\n",
    "\n",
    "    # Plot\n",
    "    speedup_by_approach = df_success.groupby('approach')['speedup'].mean()\n",
    "    speedup_by_approach.plot(kind='bar', ax=ax, color=['#1f77b4', '#ff7f0e', '#2ca02c'])\n",
    "\n",
    "    ax.set_title('Average Speedup (Realtime Factor)', fontsize=12, fontweight='bold')\n",
    "    ax.set_ylabel('Speedup (1x = realtime)')\n",
    "    ax.set_xlabel('Approach')\n",
    "    ax.axhline(y=1.0, color='red', linestyle='--', label='Realtime (1x)')\n",
    "    ax.legend()\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "    # Add value labels. The gap is given in points, not data units, so it\n",
    "    # looks the same whether the speedup is 0.01x or 100x.\n",
    "    for i, v in enumerate(speedup_by_approach):\n",
    "        if pd.notna(v):  # nothing to label when an approach has no valid speedup\n",
    "            ax.annotate(f'{v:.3f}x', xy=(i, v), xytext=(0, 3),\n",
    "                        textcoords='offset points',\n",
    "                        ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    print('\\n=== SPEEDUP STATISTICS ===')\n",
    "    print('(1x = realtime, <1x = slower than realtime)')\n",
    "    print(speedup_by_approach.round(4))\n",
    "else:\n",
    "    print('No data available')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63fdc692",
   "metadata": {},
   "source": [
    "## 3. 
Speaker Detection Accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b135268c",
   "metadata": {},
   "outputs": [],
   "source": [
    "if len(df_success) > 0:\n",
    "    EXPECTED_SPEAKERS = 4  # value marked by the dashed reference lines\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "    # Speaker count statistics\n",
    "    speaker_stats = df_success.groupby('approach')['n_speakers'].agg(['mean', 'std', 'min', 'max'])\n",
    "    # Single source of truth for the approach order used by both panels\n",
    "    approaches = list(speaker_stats.index)\n",
    "    # std is NaN for an approach with a single run; draw a zero-length bar then\n",
    "    speaker_std = speaker_stats['std'].fillna(0)\n",
    "\n",
    "    # Bar chart with error bars\n",
    "    speaker_stats['mean'].plot(kind='bar', ax=axes[0], color=['#1f77b4', '#ff7f0e', '#2ca02c'],\n",
    "                               yerr=speaker_std, capsize=5)\n",
    "    axes[0].set_title('Average Speaker Count Detection', fontsize=12, fontweight='bold')\n",
    "    axes[0].set_ylabel('Number of Speakers')\n",
    "    axes[0].set_xlabel('Approach')\n",
    "    axes[0].axhline(y=EXPECTED_SPEAKERS, color='green', linestyle='--',\n",
    "                    label=f'Expected ({EXPECTED_SPEAKERS})')\n",
    "    # Keep the 3-5 zoom when the data fits, but widen the window when it does\n",
    "    # not, so bars and error bars are never clipped out of view.\n",
    "    y_low = max(0, min(3, (speaker_stats['mean'] - speaker_std).min()))\n",
    "    y_high = max(5, (speaker_stats['mean'] + speaker_std).max())\n",
    "    axes[0].set_ylim([y_low, y_high])\n",
    "    axes[0].legend()\n",
    "    axes[0].tick_params(axis='x', rotation=45)\n",
    "\n",
    "    # Distribution, in the same approach order as the bar chart\n",
    "    speaker_by_approach = [df_success.loc[df_success['approach'] == app, 'n_speakers'].values\n",
    "                           for app in approaches]\n",
    "    axes[1].boxplot(speaker_by_approach)\n",
    "    # Name the boxes through the tick labels: boxplot's `labels=` keyword is\n",
    "    # deprecated in recent Matplotlib releases, whereas this works everywhere.\n",
    "    axes[1].set_xticklabels(approaches)\n",
    "    axes[1].set_title('Speaker Count Distribution', fontsize=12, fontweight='bold')\n",
    "    axes[1].set_ylabel('Number of Speakers')\n",
    "    axes[1].axhline(y=EXPECTED_SPEAKERS, color='green', linestyle='--',\n",
    "                    label=f'Expected ({EXPECTED_SPEAKERS})')\n",
    "    axes[1].legend()\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    print('\\n=== SPEAKER COUNT STATISTICS ===')\n",
    "    print(speaker_stats.round(2))\n",
    "else:\n",
    "    print('No data available')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7ea07dd",
   "metadata": {},
   "source": [
    "## 4. 
Per-File Performance Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d7a0d21",
   "metadata": {},
   "outputs": [],
   "source": [
    "if df_success.empty:\n",
    "    print('No data available')\n",
    "else:\n",
    "    # One row per input file, one column per approach, mean runtime in each cell\n",
    "    exec_time_pivot = pd.pivot_table(\n",
    "        df_success,\n",
    "        index='input_file',\n",
    "        columns='approach',\n",
    "        values='execution_time_seconds',\n",
    "        aggfunc='mean',\n",
    "    )\n",
    "\n",
    "    print('\\n=== EXECUTION TIME BY FILE (seconds) ===')\n",
    "    print(exec_time_pivot.round(1))\n",
    "\n",
    "    # Grouped bar chart; skipped when the pivot came out empty\n",
    "    if not exec_time_pivot.empty:\n",
    "        approach_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']\n",
    "        fig, ax = plt.subplots(figsize=(12, 6))\n",
    "        exec_time_pivot.plot(kind='bar', ax=ax, color=approach_colors)\n",
    "        ax.set_title('Execution Time per Test File', fontsize=12, fontweight='bold')\n",
    "        ax.set_xlabel('Input File')\n",
    "        ax.set_ylabel('Time (seconds)')\n",
    "        ax.legend(title='Approach')\n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5daa4901",
   "metadata": {},
   "source": [
    "## 5. 
Heatmap: All Metrics Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f618616b",
   "metadata": {},
   "outputs": [],
   "source": [
    "if len(df_success) > 0:\n",
    "    # Per-approach aggregates; these raw values are what the cells annotate.\n",
    "    # Reads the 'speedup' column that the speedup section adds to df_success.\n",
    "    heatmap_data = df_success.groupby('approach').agg({\n",
    "        'execution_time_seconds': 'mean',\n",
    "        'n_speakers': 'mean',\n",
    "        'speedup': 'mean',\n",
    "        'input_file': 'count'  # Number of tests\n",
    "    }).round(2)\n",
    "\n",
    "    heatmap_data.columns = ['Avg Exec Time (s)', 'Avg Speaker Count', 'Speedup', 'Tests Run']\n",
    "\n",
    "    # Normalize each metric to a 0-1 scale so colours are comparable across rows\n",
    "    heatmap_normalized = heatmap_data.copy()\n",
    "    for col in heatmap_normalized.columns:\n",
    "        min_val = heatmap_normalized[col].min()\n",
    "        max_val = heatmap_normalized[col].max()\n",
    "        if max_val > min_val:\n",
    "            heatmap_normalized[col] = (heatmap_normalized[col] - min_val) / (max_val - min_val)\n",
    "        elif max_val == min_val:\n",
    "            # Identical for every approach: use the neutral midpoint. Leaving\n",
    "            # the raw value in place would stretch the colour scale and wash\n",
    "            # out the differences in every other row.\n",
    "            heatmap_normalized[col] = 0.5\n",
    "\n",
    "    # Plot; vmin/vmax pin the colour bar to the normalized 0-1 range\n",
    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
    "    sns.heatmap(heatmap_normalized.T, annot=heatmap_data.T, fmt='.2f', cmap='RdYlGn_r',\n",
    "                vmin=0, vmax=1,\n",
    "                cbar_kws={'label': 'Normalized Score'}, ax=ax)\n",
    "    ax.set_title('Approach Comparison Heatmap', fontsize=12, fontweight='bold')\n",
    "    ax.set_xlabel('Approach')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    print('\\n=== METRICS SUMMARY ===')\n",
    "    print(heatmap_data)\n",
    "else:\n",
    "    print('No data available')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1661fb4f",
   "metadata": {},
   "source": [
    "## 6. 
Approach Characteristics Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4aefd7a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary characteristics of each approach (static, hand-written descriptions)\n",
    "approach_info = {\n",
    "    'ica': {\n",
    "        'Separation': 'FastICA',\n",
    "        'DoA Method': 'Mixing matrix energy ratios',\n",
    "        'Speed': 'Fast',\n",
    "        'ToI Priority': 'Spatial + Energy + Language',\n",
    "        'Best For': 'Clean environments'\n",
    "    },\n",
    "    'frankenstein': {\n",
    "        'Separation': 'FastICA',\n",
    "        'DoA Method': 'None (amplitude panning)',\n",
    "        'Speed': 'Fast',\n",
    "        'ToI Priority': 'English language (heavy bias)',\n",
    "        'Best For': 'Multilingual targets'\n",
    "    },\n",
    "    'ica_deeplearning': {\n",
    "        'Separation': 'PCA+ICA (Pass 1) + SepFormer (Pass 2)',\n",
    "        'DoA Method': 'Mixing matrix (Pass 1 only)',\n",
    "        'Speed': 'Slow/GPU-optimized',\n",
    "        'ToI Priority': 'Spatial + Energy + Language',\n",
    "        'Best For': 'Overlapping speech'\n",
    "    }\n",
    "}\n",
    "\n",
    "print('\\n' + '=' * 70)\n",
    "print('APPROACH CHARACTERISTICS SUMMARY')\n",
    "print('=' * 70)\n",
    "\n",
    "for approach, chars in approach_info.items():\n",
    "    print(f'\\n{approach.upper()}:')\n",
    "    print('-' * 70)\n",
    "    for key, value in chars.items():\n",
    "        print(f' {key:20s}: {value}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2406760e",
   "metadata": {},
   "source": [
    "## 7. Error Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec56743",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_failed = df[df['status'] == 'FAILED']\n",
    "\n",
    "if len(df_failed) > 0:\n",
    "    print(f'\\n=== FAILED RUNS: {len(df_failed)} ===')\n",
    "    for _, row in df_failed.iterrows():\n",
    "        # read_csv loads blank cells as NaN, and row.get() only falls back\n",
    "        # when the column is absent, so check for both cases explicitly.\n",
    "        error = row.get('error')\n",
    "        if pd.isna(error):\n",
    "            error = 'Unknown'\n",
    "        print(f\"\\nFile: {row['input_file']}\")\n",
    "        print(f\"Approach: {row['approach']}\")\n",
    "        print(f'Error: {error}')\n",
    "else:\n",
    "    print('\\n✅ No failed runs - all approaches successful!')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "670b3204",
   "metadata": {},
   "source": [
    "## 8. 
Recommendations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc31327f",
   "metadata": {},
   "outputs": [],
   "source": [
    "if df_success.empty:\n",
    "    print('No data available for recommendations')\n",
    "else:\n",
    "    banner = '=' * 70\n",
    "    print('\\n' + banner)\n",
    "    print('APPROACH SELECTION RECOMMENDATIONS')\n",
    "    print(banner)\n",
    "\n",
    "    # Fastest: the approach with the lowest mean execution time\n",
    "    mean_exec_by_approach = df_success.groupby('approach')['execution_time_seconds'].mean()\n",
    "    fastest = mean_exec_by_approach.idxmin()\n",
    "    fastest_rows = df_success['approach'] == fastest\n",
    "    fastest_avg = df_success.loc[fastest_rows, 'execution_time_seconds'].mean()\n",
    "    print(f'\\n⚡ FASTEST: {fastest.upper()}')\n",
    "    print(f' Avg time: {fastest_avg:.1f}s')\n",
    "    print(' Use when: You need realtime or near-realtime processing')\n",
    "\n",
    "    # The remaining recommendations are fixed text, not derived from the data\n",
    "    print('\\n🌍 BEST FOR MULTILINGUAL: frankenstein')\n",
    "    print(' Heavy English bias helps when target speaker is known to be English')\n",
    "\n",
    "    print('\\n📊 BEST FOR OVERLAPPING SPEECH: ica_deeplearning')\n",
    "    print(' Two-pass approach handles temporal overlap better')\n",
    "    print(' Good for: multi-speaker conversations, active background')\n",
    "\n",
    "    print('\\n⚖️ BALANCED CHOICE: ica')\n",
    "    print(' Good performance + reasonable speed')\n",
    "    print(' Spatial information helps talker selection')\n",
    "\n",
    "    print('\\n' + banner)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "400ed39e",
   "metadata": {},
   "source": [
    "## 9. 
Export Summary Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "728bc904",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a summary report\n",
    "if len(df_success) > 0:\n",
    "    # Aggregate once up front so the template below stays readable.\n",
    "    # Relies on `df_failed` and the 'speedup' column from earlier sections.\n",
    "    by_approach = df_success.groupby('approach')\n",
    "    exec_time_table = by_approach['execution_time_seconds'].agg(['mean', 'min', 'max']).round(1).to_string()\n",
    "    speaker_table = by_approach['n_speakers'].describe().round(2).to_string()\n",
    "    speedup_table = by_approach['speedup'].mean().round(4).to_string()\n",
    "    fastest_approach = by_approach['execution_time_seconds'].mean().idxmin().upper()\n",
    "    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n",
    "\n",
    "    summary_report = f'''\n",
    " \n",
    "=================================================================\n",
    "MULTI-TALKER AUDIO SOURCE SEPARATION BENCHMARK REPORT\n",
    "=================================================================\n",
    "\n",
    "Generated: {generated_at}\n",
    "\n",
    "--- OVERALL STATISTICS ---\n",
    "Total Runs: {len(df)}\n",
    "Successful: {len(df_success)}\n",
    "Failed: {len(df_failed)}\n",
    "\n",
    "--- EXECUTION TIME ---\n",
    "{exec_time_table}\n",
    "\n",
    "--- SPEAKER DETECTION ---\n",
    "{speaker_table}\n",
    "\n",
    "--- SPEEDUP (Realtime Factor) ---\n",
    "{speedup_table}\n",
    "\n",
    "--- RECOMMENDATION ---\n",
    "Fastest Approach: {fastest_approach}\n",
    "Best for Multilingual: frankenstein (English priority)\n",
    "Best for Overlapping: ica_deeplearning (Two-pass)\n",
    "Balanced: ica (Speed + Spatial Info)\n",
    "\n",
    "=================================================================\n",
    " '''\n",
    "\n",
    "    print(summary_report)\n",
    "\n",
    "    # Save to file. The encoding is pinned so the report is written the same\n",
    "    # way on every platform instead of depending on the locale default.\n",
    "    report_path = RESULTS_DIR / 'BENCHMARK_REPORT.txt'\n",
    "    report_path.write_text(summary_report, encoding='utf-8')\n",
    "    print(f'\\n✅ Report saved to: {report_path}')\n",
    "else:\n",
    "    print('No data available')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "audio-2026 (3.12.7)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}