import re
from collections import Counter
from typing import Dict, List

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def generate_dashboard(data):
    """Legacy function - kept for backwards compatibility.

    Delegates to ``generate_comprehensive_dashboard`` with the interviewee
    type ``"Other"``, which selects the general dashboard.
    """
    return generate_comprehensive_dashboard(data, "Other")


def extract_items_from_field(data: List[Dict], field_name: str) -> List[str]:
    """Extract and split items from a semicolon-separated field.

    Args:
        data: Rows to scan; each row is a dict.
        field_name: Key whose value holds ``;``-separated items.

    Returns:
        A flat list of stripped, non-empty items in row order. Rows where the
        field is missing, empty, or not a string are skipped.
    """
    items = []
    for row in data:
        value = row.get(field_name, "")
        if not value or not isinstance(value, str):
            continue
        items.extend(part.strip() for part in value.split(';') if part.strip())
    return items


def _numeric_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` coerced to numbers with unparseable rows dropped.

    Returns an empty Series when the column does not exist.
    """
    if column not in df.columns:
        return pd.Series(dtype=float)
    return pd.to_numeric(df[column], errors='coerce').dropna()


def _truncate(label: str, limit: int) -> str:
    """Shorten ``label`` to ``limit`` characters plus '...' when it is longer."""
    return label[:limit] + '...' if len(label) > limit else label


def _top_items(df: pd.DataFrame, column: str, top_n: int) -> list:
    """Return the ``top_n`` most common ``(item, count)`` pairs of a column."""
    # Only the requested column is converted to records.
    rows = df[[column]].to_dict('records')
    return Counter(extract_items_from_field(rows, column)).most_common(top_n)


def _plot_quality_histogram(ax: plt.Axes, df: pd.DataFrame) -> None:
    """Draw the 'Quality Score' histogram with a mean marker on ``ax``.

    Leaves ``ax`` untouched when the column is missing or has no numeric
    values.
    """
    scores = _numeric_column(df, 'Quality Score')
    if scores.empty:
        return
    mean_score = scores.mean()
    ax.hist(scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
    ax.axvline(mean_score, color='red', linestyle='--',
               label=f'Mean: {mean_score:.2f}')
    ax.set_xlabel('Quality Score')
    ax.set_ylabel('Frequency')
    ax.set_title('Transcript Quality Distribution')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)


def _plot_top_items(ax: plt.Axes, df: pd.DataFrame, column: str, *,
                    top_n: int, color: str, title: str,
                    empty_message: str = "") -> None:
    """Draw a horizontal bar chart of the most frequent items in ``column``.

    Args:
        ax: Axes to draw on.
        df: Source rows.
        column: Semicolon-separated text column to count items from.
        top_n: Maximum number of bars.
        color: Bar fill colour.
        title: Axes title.
        empty_message: If non-empty, shown centred on a blank axes when the
            column exists but yields no items.
    """
    if column not in df.columns:
        return

    top = _top_items(df, column, top_n)
    if not top:
        if empty_message:
            ax.text(0.5, 0.5, empty_message, ha='center', va='center',
                    transform=ax.transAxes, fontsize=12)
            ax.axis('off')
        return

    labels = [_truncate(name, 30) for name, _ in top]
    values = [count for _, count in top]
    # Bars are placed at numeric positions rather than by label: passing the
    # (truncated) labels as categories would merge bars whose labels collide.
    positions = list(range(len(top)))
    bars = ax.barh(positions, values, color=color, edgecolor='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Frequency')
    ax.set_title(title)
    ax.invert_yaxis()  # most common item at the top

    # Value label just right of each bar.
    for bar in bars:
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height() / 2, f' {int(width)}',
                ha='left', va='center', fontsize=9)


def generate_comprehensive_dashboard(
    data: List[Dict],
    interviewee_type: str
) -> plt.Figure:
    """Generate a comprehensive dashboard with multiple visualizations.

    Args:
        data: One dict per transcript; converted to a DataFrame.
        interviewee_type: ``"HCP"`` or ``"Patient"`` select the matching
            dashboard; any other value selects the general dashboard.

    Returns:
        The populated figure, or a single-axes placeholder figure with a
        'No data available' message when ``data`` is empty.
    """
    if not data:
        # Return empty figure with message
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available for visualization',
                ha='center', va='center', fontsize=14)
        ax.axis('off')
        return fig

    df = pd.DataFrame(data)

    if interviewee_type == "HCP":
        fig = create_hcp_dashboard(df)
    elif interviewee_type == "Patient":
        fig = create_patient_dashboard(df)
    else:
        fig = create_general_dashboard(df)

    # Lay out the figure we built, not whichever figure pyplot considers
    # "current".
    fig.tight_layout()
    return fig


def create_hcp_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for HCP interviews.

    Panels: quality-score histogram, top diagnoses, top prescriptions and
    word count per transcript. Each panel is drawn only if its source
    column(s) exist ('Quality Score', 'Diagnoses', 'Prescriptions',
    'Word Count' + 'Transcript ID').
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Healthcare Professional Interview Analysis',
                 fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution
    _plot_quality_histogram(axes[0, 0], df)

    # 2. Top Diagnoses
    _plot_top_items(axes[0, 1], df, 'Diagnoses', top_n=8,
                    color='#2ecc71', title='Most Common Diagnoses')

    # 3. Prescription Analysis
    _plot_top_items(axes[1, 0], df, 'Prescriptions', top_n=8,
                    color='#e74c3c', title='Most Mentioned Prescriptions')

    # 4. Word Count by Transcript
    ax4 = axes[1, 1]
    if 'Word Count' in df.columns and 'Transcript ID' in df.columns:
        numeric = pd.to_numeric(df['Word Count'], errors='coerce')
        valid = numeric.notna()
        word_counts = numeric[valid]
        # Select IDs with the same mask so every bar keeps its own
        # transcript's label even when some rows were dropped.
        transcript_ids = df['Transcript ID'][valid].astype(str).tolist()
        if len(word_counts) > 0:
            positions = range(len(word_counts))
            mean_words = word_counts.mean()
            ax4.bar(positions, word_counts.to_numpy(), color='#9b59b6',
                    edgecolor='black', alpha=0.7)
            ax4.set_xlabel('Transcript')
            ax4.set_ylabel('Word Count')
            ax4.set_title('Interview Length by Transcript')
            ax4.set_xticks(positions)
            ax4.set_xticklabels(transcript_ids, rotation=45, ha='right')
            ax4.grid(axis='y', alpha=0.3)
            # Add mean line
            ax4.axhline(mean_words, color='red', linestyle='--',
                        label=f'Average: {int(mean_words)}')
            ax4.legend()

    return fig


def create_patient_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for Patient interviews.

    Panels: quality-score histogram, top symptoms ('Primary Symptoms'),
    pie chart of 'Main Concerns', and top 'Side Effects'. Each panel is drawn
    only if its source column exists.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Patient Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution
    _plot_quality_histogram(axes[0, 0], df)

    # 2. Top Symptoms
    _plot_top_items(axes[0, 1], df, 'Primary Symptoms', top_n=8,
                    color='#e67e22', title='Most Common Symptoms')

    # 3. Patient Concerns (pie chart of the six most frequent)
    ax3 = axes[1, 0]
    if 'Main Concerns' in df.columns:
        top_concerns = _top_items(df, 'Main Concerns', 6)
        if top_concerns:
            labels = [_truncate(name, 25) for name, _ in top_concerns]
            sizes = [count for _, count in top_concerns]
            colors_list = ['#ff6b6b', '#4ecdc4', '#45b7d1',
                           '#f9ca24', '#6c5ce7', '#a29bfe']
            ax3.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
                    colors=colors_list[:len(sizes)])
            ax3.set_title('Distribution of Patient Concerns')

    # 4. Side Effects (placeholder text when the column has no items)
    _plot_top_items(axes[1, 1], df, 'Side Effects', top_n=6,
                    color='#e74c3c', title='Reported Side Effects',
                    empty_message='No side effects reported')

    return fig


def create_general_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the general 2x2 dashboard.

    Panels: quality-score histogram, word-count histogram, quality-score
    category counts, and a summary statistics table. Each panel is drawn only
    if its source column(s) exist.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('General Interview Analysis', fontsize=16, fontweight='bold')

    has_quality = 'Quality Score' in df.columns
    quality_scores = _numeric_column(df, 'Quality Score')
    word_counts = _numeric_column(df, 'Word Count')

    # 1. Quality Score Distribution
    _plot_quality_histogram(axes[0, 0], df)

    # 2. Word Count Distribution
    ax2 = axes[0, 1]
    if not word_counts.empty:
        ax2.hist(word_counts, bins=15, color='#2ecc71', edgecolor='black',
                 alpha=0.7)
        ax2.set_xlabel('Word Count')
        ax2.set_ylabel('Frequency')
        ax2.set_title('Interview Length Distribution')
        ax2.grid(axis='y', alpha=0.3)

    # 3. Processing Summary (drawn whenever the column exists, even if every
    #    score is non-numeric, in which case all counts are zero)
    ax3 = axes[1, 0]
    if has_quality:
        categories = ['Excellent\n(>0.8)', 'Good\n(0.6-0.8)',
                      'Fair\n(0.4-0.6)', 'Poor\n(<0.4)']
        counts = [
            int((quality_scores > 0.8).sum()),
            int(((quality_scores >= 0.6) & (quality_scores <= 0.8)).sum()),
            int(((quality_scores >= 0.4) & (quality_scores < 0.6)).sum()),
            int((quality_scores < 0.4).sum()),
        ]
        colors_list = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']
        bars = ax3.bar(categories, counts, color=colors_list,
                       edgecolor='black', alpha=0.7)
        ax3.set_ylabel('Number of Transcripts')
        ax3.set_title('Quality Score Categories')
        ax3.grid(axis='y', alpha=0.3)
        # Add value labels (skipped for empty categories)
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax3.text(bar.get_x() + bar.get_width() / 2., height,
                         f'{int(height)}', ha='center', va='bottom',
                         fontsize=10)

    # 4. Summary Statistics Table
    ax4 = axes[1, 1]
    ax4.axis('off')
    stats_data = []
    if 'Transcript ID' in df.columns:
        stats_data.append(['Total Transcripts', str(len(df))])
    if not quality_scores.empty:
        stats_data.append(['Avg Quality Score', f"{quality_scores.mean():.2f}"])
        stats_data.append(['Min Quality Score', f"{quality_scores.min():.2f}"])
        stats_data.append(['Max Quality Score', f"{quality_scores.max():.2f}"])
    if not word_counts.empty:
        stats_data.append(['Avg Word Count', f"{int(word_counts.mean()):,}"])
        stats_data.append(['Total Words', f"{int(word_counts.sum()):,}"])

    if stats_data:
        table = ax4.table(cellText=stats_data, cellLoc='left',
                          colWidths=[0.5, 0.3], loc='center',
                          colLabels=['Metric', 'Value'])
        table.auto_set_font_size(False)
        table.set_fontsize(11)
        table.scale(1, 2)

        # Style the table: dark header row (row 0), light shading on every
        # even-numbered body row.
        for col in (0, 1):
            header_cell = table[(0, col)]
            header_cell.set_facecolor('#34495e')
            header_cell.set_text_props(weight='bold', color='white')
        for row in range(2, len(stats_data) + 1, 2):
            for col in (0, 1):
                table[(row, col)].set_facecolor('#ecf0f1')

        ax4.set_title('Summary Statistics', fontsize=12, fontweight='bold',
                      pad=20)

    return fig