Spaces:

risky-conversations
/

Visualizer

Sleeping

App Files Community

acmc committed on Jun 16, 2025

Commit

b442037

verified ·

1 Parent(s): 21a08b0

Update streamlit_app.py

Browse files

Files changed (1) hide show

streamlit_app.py +311 -212

streamlit_app.py CHANGED Viewed

@@ -212,8 +212,8 @@ def main():
     st.title("🔍 Complexity Metrics Explorer")
     st.markdown("Interactive visualization of conversation complexity metrics across different dataset types.")
-    # Dataset selection at the top
-    st.header("🗂️ Dataset Selection")
     # Available datasets
     available_datasets = [
@@ -223,36 +223,31 @@ def main():
         "Custom..."
     ]
-    col1, col2 = st.columns([3, 1])
-    with col1:
-        selected_option = st.selectbox(
-            "Select Dataset",
-            options=available_datasets,
-            index=0,  # Default to reduced dataset
-            help="Choose which dataset to analyze",
-            format_func=lambda x: x.split('/')[-1] if x != "Custom..." else x  # Show only the dataset name part
-        )
-    with col2:
-        # Add refresh button
-        if st.button("🔄 Refresh Data", help="Clear cache and reload dataset"):
-            st.cache_data.clear()
-            st.rerun()
     # Handle custom dataset input
     if selected_option == "Custom...":
-        selected_dataset = st.text_input(
             "Custom Dataset Name",
             value="risky-conversations/jailbreaks_dataset_with_results_reduced",
             help="Enter the full dataset name (e.g., 'risky-conversations/jailbreaks_dataset_with_results_reduced')"
         )
         if not selected_dataset.strip():
-            st.warning("Please enter a dataset name")
             st.stop()
     else:
         selected_dataset = selected_option
     # Load data
     with st.spinner(f"Loading dataset: {selected_dataset}..."):
         try:
@@ -280,52 +275,48 @@ def main():
     if not data_loaded:
         st.stop()
-    # Controls at the top of the page
-    st.header("🎛️ Analysis Controls")
     # Dataset type filter
     dataset_types = df['type'].unique()
-    col1, col2 = st.columns(2)
-    with col1:
-        selected_types = st.multiselect(
-            "Select Dataset Types",
-            options=dataset_types,
-            default=dataset_types,
-            help="Filter by conversation type"
-        )
     # Role filter
-    with col2:
-        if 'turn.role' in df_exploded.columns:
-            roles = df_exploded['turn.role'].dropna().unique()
-            # Assert only user and assistant roles exist
-            expected_roles = {'user', 'assistant'}
-            actual_roles = set(roles)
-            assert actual_roles.issubset(expected_roles), f"Unexpected roles found: {actual_roles - expected_roles}. Expected only 'user' and 'assistant'"
-            st.subheader("👥 Role Filter")
-            col2_1, col2_2 = st.columns(2)
-            with col2_1:
-                include_user = st.checkbox("User", value=True, help="Include user turns")
-            with col2_2:
-                include_assistant = st.checkbox("Assistant", value=True, help="Include assistant turns")
-            # Build selected roles list
-            selected_roles = []
-            if include_user and 'user' in roles:
-                selected_roles.append('user')
-            if include_assistant and 'assistant' in roles:
-                selected_roles.append('assistant')
-            # Show selection info
-            if selected_roles:
-                st.success(f"Including: {', '.join(selected_roles)}")
-            else:
-                st.warning("No roles selected")
         else:
-            selected_roles = None
     # Filter data based on selections
     filtered_df = df[df['type'].isin(selected_types)] if selected_types else df
@@ -343,7 +334,7 @@ def main():
         st.stop()
     # Metric selection
-    st.header("📊 Metrics Selection")
     # Dynamic metric categorization based on common patterns
     def categorize_metrics(metrics):
@@ -386,28 +377,24 @@ def main():
     metric_categories = categorize_metrics(available_metrics)
     # Metric selection interface
-    selection_mode = st.radio(
         "Selection Mode",
         ["By Category", "Search/Filter", "Select All"],
-        help="Choose how to select metrics",
-        horizontal=True
     )
     if selection_mode == "By Category":
-        col1, col2 = st.columns([2, 1])
-        with col1:
-            selected_category = st.selectbox(
-                "Metric Category",
-                options=list(metric_categories.keys()),
-                help=f"Found {len(metric_categories)} categories"
-            )
         available_in_category = metric_categories[selected_category]
         default_selection = available_in_category[:5] if len(available_in_category) > 5 else available_in_category
         # Add select all button for category
-        col1, col2 = st.columns(2)
         with col1:
             if st.button("Select All", key="select_all_category"):
                 st.session_state.selected_metrics_category = available_in_category
@@ -419,7 +406,7 @@ def main():
         if "selected_metrics_category" not in st.session_state:
             st.session_state.selected_metrics_category = default_selection
-        selected_metrics = st.multiselect(
             f"Select Metrics ({len(available_in_category)} available)",
             options=available_in_category,
             default=st.session_state.selected_metrics_category,
@@ -428,7 +415,7 @@ def main():
         )
     elif selection_mode == "Search/Filter":
-        search_term = st.text_input(
             "Search Metrics",
             placeholder="Enter keywords to filter metrics...",
             help="Search for metrics containing specific terms"
@@ -439,10 +426,10 @@ def main():
         else:
             filtered_metrics = available_metrics
-        st.write(f"Found {len(filtered_metrics)} metrics")
         # Add select all button for search results
-        col1, col2 = st.columns(2)
         with col1:
             if st.button("Select All", key="select_all_search"):
                 st.session_state.selected_metrics_search = filtered_metrics
@@ -454,7 +441,7 @@ def main():
         if "selected_metrics_search" not in st.session_state:
             st.session_state.selected_metrics_search = filtered_metrics[:5] if len(filtered_metrics) > 5 else filtered_metrics[:3]
-        selected_metrics = st.multiselect(
             "Select Metrics",
             options=filtered_metrics,
             default=st.session_state.selected_metrics_search,
@@ -464,7 +451,7 @@ def main():
     else:  # Select All
         # Add select all button for all metrics
-        col1, col2 = st.columns(2)
         with col1:
             if st.button("Select All", key="select_all_all"):
                 st.session_state.selected_metrics_all = available_metrics
@@ -476,7 +463,7 @@ def main():
         if "selected_metrics_all" not in st.session_state:
             st.session_state.selected_metrics_all = available_metrics[:10]  # Limit default to first 10 for performance
-        selected_metrics = st.multiselect(
             f"All Metrics ({len(available_metrics)} total)",
             options=available_metrics,
             default=st.session_state.selected_metrics_all,
@@ -486,18 +473,18 @@ def main():
     # Show selection summary
     if selected_metrics:
-        st.success(f"Selected {len(selected_metrics)} metrics")
         # Performance warning for large selections
         if len(selected_metrics) > 20:
-            st.warning(f"⚠️ Large selection ({len(selected_metrics)} metrics) may impact performance")
         elif len(selected_metrics) > 50:
-            st.error(f"🚨 Very large selection ({len(selected_metrics)} metrics) - consider reducing for better performance")
     else:
-        st.warning("No metrics selected")
     # Metric info expander
-    with st.expander("ℹ️ Metric Information", expanded=False):
         st.write(f"**Total Available Metrics:** {len(available_metrics)}")
         st.write(f"**Categories Found:** {len(metric_categories)}")
@@ -506,8 +493,6 @@ def main():
             for i, metric in enumerate(available_metrics, 1):
                 st.write(f"{i}. `{metric}`")
-    st.divider()  # Visual separator before main content
     # Main content tabs
     tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 Distributions", "🔗 Correlations", "📈 Comparisons", "🔍 Conversation", "🎯 Details"])
@@ -704,6 +689,7 @@ def main():
         # Display conversation metadata
         st.subheader("📋 Conversation Overview")
         col1, col2, col3, col4 = st.columns(4)
         with col1:
             st.metric("Type", selected_conversation['type'])
@@ -718,6 +704,68 @@ def main():
             assistant_turns = roles.count('assistant')
             st.metric("User/Assistant", f"{user_turns}/{assistant_turns}")
         # Get conversation turns with metrics
         conv_turns_data = filtered_df_exploded[filtered_df_exploded.index.isin(
             filtered_df_exploded[filtered_df_exploded.index // len(filtered_df_exploded) * len(filtered_df) +
@@ -739,146 +787,197 @@ def main():
         # Simpler approach: get all turns from the conversation directly
         conversation_turns = selected_conversation.get('conversation', [])
-        if conversation_turns:
-            # Display conversation content
-            st.subheader("💬 Conversation Content")
-            # Show/hide content toggle
-            show_content = st.checkbox("Show conversation content", value=True)
-            if show_content:
-                for i, turn in enumerate(conversation_turns):
-                    role = turn.get('role', 'unknown')
-                    content = turn.get('content', 'No content')
-                    # Style based on role
-                    if role == 'user':
-                        st.markdown(f"**👤 User (Turn {i+1}):**")
-                        st.info(content)
-                    elif role == 'assistant':
-                        st.markdown(f"**🤖 Assistant (Turn {i+1}):**")
-                        st.success(content)
-                    else:
-                        st.markdown(f"**❓ {role.title()} (Turn {i+1}):**")
-                        st.warning(content)
-            # Display turn-level metrics if available
-            st.subheader("📊 Turn-Level Metrics")
-            if selected_metrics:
-                # Get actual turn-level data for this conversation
-                # Find matching turns in the exploded dataframe
-                conv_turn_metrics = []
-                # Simple approach: try to match turns by content or position
-                # This is a best-effort approach since exact matching is complex
-                turn_metric_columns = [f"turn.turn_metrics.{m}" for m in selected_metrics]
-                available_columns = [col for col in turn_metric_columns if col in filtered_df_exploded.columns]
-                if available_columns:
-                    # Try to get metrics for turns from this conversation type
-                    type_turns = filtered_df_exploded[filtered_df_exploded['type'] == selected_conversation['type']]
-                    # Take a sample of turns for this conversation type (since exact matching is complex)
-                    sample_size = min(len(conversation_turns), len(type_turns))
-                    if sample_size > 0:
-                        sample_turns = type_turns.head(sample_size)
-                        # Create metrics table
-                        metrics_display_data = []
-                        for i, (_, turn_row) in enumerate(sample_turns.iterrows()):
-                            if i < len(conversation_turns):
-                                turn_data = {
-                                    'Turn': i + 1,
-                                    'Role': conversation_turns[i].get('role', 'unknown')
-                                }
-                                # Add actual metric values
-                                for col in available_columns:
-                                    metric_name = col.replace('turn.turn_metrics.', '')
-                                    friendly_name = get_human_friendly_metric_name(metric_name)
-                                    value = turn_row.get(col, 'N/A')
-                                    if pd.notna(value) and isinstance(value, (int, float)):
-                                        turn_data[friendly_name] = round(value, 3)
                                     else:
-                                        turn_data[friendly_name] = 'N/A'
-                                metrics_display_data.append(turn_data)
-                        if metrics_display_data:
-                            metrics_df = pd.DataFrame(metrics_display_data)
-                            st.dataframe(metrics_df, use_container_width=True)
-                            # Plot metrics over turns with real data
-                            st.subheader("📈 Metrics Over Turns")
-                            fig = go.Figure()
-                            # Add traces for each selected metric (real data)
-                            for col in available_columns[:5]:  # Limit to first 5 for readability
-                                metric_name = col.replace('turn.turn_metrics.', '')
-                                friendly_name = get_human_friendly_metric_name(metric_name)
-                                # Get values for this metric
-                                y_values = []
-                                for _, turn_row in sample_turns.iterrows():
-                                    value = turn_row.get(col, None)
-                                    if pd.notna(value) and isinstance(value, (int, float)):
-                                        y_values.append(value)
                                     else:
-                                        y_values.append(None)
-                                if any(v is not None for v in y_values):
-                                    fig.add_trace(go.Scatter(
-                                        x=list(range(1, len(y_values) + 1)),
-                                        y=y_values,
-                                        mode='lines+markers',
-                                        name=friendly_name,
-                                        line=dict(width=2),
-                                        marker=dict(size=8),
-                                        connectgaps=False
-                                    ))
-                            if fig.data:  # Only show if we have data
-                                fig.update_layout(
-                                    title="Complexity Metrics Across Conversation Turns",
-                                    xaxis_title="Turn Number",
-                                    yaxis_title="Metric Value",
-                                    height=400,
-                                    hovermode='x unified'
-                                )
-                                st.plotly_chart(fig, use_container_width=True)
-                            else:
-                                st.info("No numeric metric data available to plot for this conversation type.")
                         else:
-                            st.info("No matching turn-level metrics found for this conversation.")
-                    else:
-                        st.info("No turn-level data available for this conversation type.")
-                else:
-                    st.warning("No turn-level metrics available in the dataset for the selected metrics.")
-                # Show raw turn content with role highlighting
-                with st.expander("🔍 Detailed Turn Analysis", expanded=False):
-                    for i, turn in enumerate(conversation_turns):
-                        role = turn.get('role', 'unknown')
-                        content = turn.get('content', 'No content')
-                        st.markdown(f"**Turn {i+1} ({role}):**")
-                        st.text_area(
-                            f"Content",
-                            content,
-                            height=100,
-                            key=f"turn_content_{i}",
-                            disabled=True
-                        )
-                        # Show turn statistics
-                        st.caption(f"Characters: {len(content)} | Words: {len(content.split())} | Role: {role}")
-                        st.divider()
             else:
-                st.warning("Select some metrics to see turn-level analysis.")
         else:
             st.warning("No conversation data available for the selected conversation.")

     st.title("🔍 Complexity Metrics Explorer")
     st.markdown("Interactive visualization of conversation complexity metrics across different dataset types.")
+    # Dataset selection
+    st.sidebar.header("🗂️ Dataset Selection")
     # Available datasets
     available_datasets = [
         "Custom..."
     ]
+    selected_option = st.sidebar.selectbox(
+        "Select Dataset",
+        options=available_datasets,
+        index=0,  # Default to reduced dataset
+        help="Choose which dataset to analyze"
+    )
     # Handle custom dataset input
     if selected_option == "Custom...":
+        selected_dataset = st.sidebar.text_input(
             "Custom Dataset Name",
             value="risky-conversations/jailbreaks_dataset_with_results_reduced",
             help="Enter the full dataset name (e.g., 'risky-conversations/jailbreaks_dataset_with_results_reduced')"
         )
         if not selected_dataset.strip():
+            st.sidebar.warning("Please enter a dataset name")
             st.stop()
     else:
         selected_dataset = selected_option
+    # Add refresh button
+    if st.sidebar.button("🔄 Refresh Data", help="Clear cache and reload dataset"):
+        st.cache_data.clear()
+        st.rerun()
     # Load data
     with st.spinner(f"Loading dataset: {selected_dataset}..."):
         try:
     if not data_loaded:
         st.stop()
+    # Sidebar controls
+    st.sidebar.header("🎛️ Controls")
     # Dataset type filter
     dataset_types = df['type'].unique()
+    selected_types = st.sidebar.multiselect(
+        "Select Dataset Types",
+        options=dataset_types,
+        default=dataset_types,
+        help="Filter by conversation type"
+    )
     # Role filter
+    if 'turn.role' in df_exploded.columns:
+        roles = df_exploded['turn.role'].dropna().unique()
+        # Assert only user and assistant roles exist
+        expected_roles = {'user', 'assistant'}
+        actual_roles = set(roles)
+        assert actual_roles.issubset(expected_roles), f"Unexpected roles found: {actual_roles - expected_roles}. Expected only 'user' and 'assistant'"
+        st.sidebar.subheader("👥 Role Filter")
+        col1, col2 = st.sidebar.columns(2)
+        with col1:
+            include_user = st.checkbox("User", value=True, help="Include user turns")
+        with col2:
+            include_assistant = st.checkbox("Assistant", value=True, help="Include assistant turns")
+        # Build selected roles list
+        selected_roles = []
+        if include_user and 'user' in roles:
+            selected_roles.append('user')
+        if include_assistant and 'assistant' in roles:
+            selected_roles.append('assistant')
+        # Show selection info
+        if selected_roles:
+            st.sidebar.success(f"Including: {', '.join(selected_roles)}")
         else:
+            st.sidebar.warning("No roles selected")
+    else:
+        selected_roles = None
     # Filter data based on selections
     filtered_df = df[df['type'].isin(selected_types)] if selected_types else df
         st.stop()
     # Metric selection
+    st.sidebar.header("📊 Metrics")
     # Dynamic metric categorization based on common patterns
     def categorize_metrics(metrics):
     metric_categories = categorize_metrics(available_metrics)
     # Metric selection interface
+    selection_mode = st.sidebar.radio(
         "Selection Mode",
         ["By Category", "Search/Filter", "Select All"],
+        help="Choose how to select metrics"
     )
     if selection_mode == "By Category":
+        selected_category = st.sidebar.selectbox(
+            "Metric Category",
+            options=list(metric_categories.keys()),
+            help=f"Found {len(metric_categories)} categories"
+        )
         available_in_category = metric_categories[selected_category]
         default_selection = available_in_category[:5] if len(available_in_category) > 5 else available_in_category
         # Add select all button for category
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_category"):
                 st.session_state.selected_metrics_category = available_in_category
         if "selected_metrics_category" not in st.session_state:
             st.session_state.selected_metrics_category = default_selection
+        selected_metrics = st.sidebar.multiselect(
             f"Select Metrics ({len(available_in_category)} available)",
             options=available_in_category,
             default=st.session_state.selected_metrics_category,
         )
     elif selection_mode == "Search/Filter":
+        search_term = st.sidebar.text_input(
             "Search Metrics",
             placeholder="Enter keywords to filter metrics...",
             help="Search for metrics containing specific terms"
         else:
             filtered_metrics = available_metrics
+        st.sidebar.write(f"Found {len(filtered_metrics)} metrics")
         # Add select all button for search results
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_search"):
                 st.session_state.selected_metrics_search = filtered_metrics
         if "selected_metrics_search" not in st.session_state:
             st.session_state.selected_metrics_search = filtered_metrics[:5] if len(filtered_metrics) > 5 else filtered_metrics[:3]
+        selected_metrics = st.sidebar.multiselect(
             "Select Metrics",
             options=filtered_metrics,
             default=st.session_state.selected_metrics_search,
     else:  # Select All
         # Add select all button for all metrics
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_all"):
                 st.session_state.selected_metrics_all = available_metrics
         if "selected_metrics_all" not in st.session_state:
             st.session_state.selected_metrics_all = available_metrics[:10]  # Limit default to first 10 for performance
+        selected_metrics = st.sidebar.multiselect(
             f"All Metrics ({len(available_metrics)} total)",
             options=available_metrics,
             default=st.session_state.selected_metrics_all,
     # Show selection summary
     if selected_metrics:
+        st.sidebar.success(f"Selected {len(selected_metrics)} metrics")
         # Performance warning for large selections
         if len(selected_metrics) > 20:
+            st.sidebar.warning(f"⚠️ Large selection ({len(selected_metrics)} metrics) may impact performance")
         elif len(selected_metrics) > 50:
+            st.sidebar.error(f"🚨 Very large selection ({len(selected_metrics)} metrics) - consider reducing for better performance")
     else:
+        st.sidebar.warning("No metrics selected")
     # Metric info expander
+    with st.sidebar.expander("ℹ️ Metric Information", expanded=False):
         st.write(f"**Total Available Metrics:** {len(available_metrics)}")
         st.write(f"**Categories Found:** {len(metric_categories)}")
             for i, metric in enumerate(available_metrics, 1):
                 st.write(f"{i}. `{metric}`")
     # Main content tabs
     tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 Distributions", "🔗 Correlations", "📈 Comparisons", "🔍 Conversation", "🎯 Details"])
         # Display conversation metadata
         st.subheader("📋 Conversation Overview")
+        # First row - basic info
         col1, col2, col3, col4 = st.columns(4)
         with col1:
             st.metric("Type", selected_conversation['type'])
             assistant_turns = roles.count('assistant')
             st.metric("User/Assistant", f"{user_turns}/{assistant_turns}")
+        # Second row - additional metadata
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            provenance = selected_conversation.get('provenance_dataset', 'Unknown')
+            st.metric("Dataset Source", provenance)
+        with col2:
+            language = selected_conversation.get('language', 'Unknown')
+            st.metric("Language", language.upper() if language else 'Unknown')
+        with col3:
+            timestamp = selected_conversation.get('timestamp', None)
+            if timestamp:
+                # Handle different timestamp formats
+                if isinstance(timestamp, str):
+                    st.metric("Timestamp", timestamp)
+                else:
+                    st.metric("Timestamp", str(timestamp))
+            else:
+                st.metric("Timestamp", "Not Available")
+        # Add toxicity summary
+        conversation_turns_temp = selected_conversation.get('conversation', [])
+        if hasattr(conversation_turns_temp, 'tolist'):
+            conversation_turns_temp = conversation_turns_temp.tolist()
+        elif conversation_turns_temp is None:
+            conversation_turns_temp = []
+        if len(conversation_turns_temp) > 0:
+            # Calculate overall toxicity statistics
+            all_toxicities = []
+            for turn in conversation_turns_temp:
+                toxicities = turn.get('toxicities', {})
+                if toxicities and 'toxicity' in toxicities:
+                    all_toxicities.append(toxicities['toxicity'])
+            if all_toxicities:
+                avg_toxicity = sum(all_toxicities) / len(all_toxicities)
+                max_toxicity = max(all_toxicities)
+                st.markdown("**🔍 Toxicity Summary:**")
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    # Color code average toxicity
+                    if avg_toxicity > 0.5:
+                        st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="HIGH", delta_color="inverse")
+                    elif avg_toxicity > 0.1:
+                        st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="MED", delta_color="off")
+                    else:
+                        st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="LOW", delta_color="normal")
+                with col2:
+                    # Color code max toxicity
+                    if max_toxicity > 0.5:
+                        st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="HIGH", delta_color="inverse")
+                    elif max_toxicity > 0.1:
+                        st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="MED", delta_color="off")
+                    else:
+                        st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="LOW", delta_color="normal")
+                with col3:
+                    high_tox_turns = sum(1 for t in all_toxicities if t > 0.5)
+                    st.metric("High Toxicity Turns", high_tox_turns)
         # Get conversation turns with metrics
         conv_turns_data = filtered_df_exploded[filtered_df_exploded.index.isin(
             filtered_df_exploded[filtered_df_exploded.index // len(filtered_df_exploded) * len(filtered_df) +
         # Simpler approach: get all turns from the conversation directly
         conversation_turns = selected_conversation.get('conversation', [])
+        # Ensure conversation_turns is a list and handle different data types
+        if hasattr(conversation_turns, 'tolist'):
+            conversation_turns = conversation_turns.tolist()
+        elif conversation_turns is None:
+            conversation_turns = []
+        if len(conversation_turns) > 0:
+            # Display conversation content with metrics
+            st.subheader("💬 Conversation with Metrics")
+            # Get actual turn-level data for this conversation
+            turn_metric_columns = [f"turn.turn_metrics.{m}" for m in selected_metrics]
+            available_columns = [col for col in turn_metric_columns if col in filtered_df_exploded.columns]
+            # Get sample metrics for this conversation type (since exact matching is complex)
+            sample_metrics = None
+            if available_columns:
+                type_turns = filtered_df_exploded[filtered_df_exploded['type'] == selected_conversation['type']]
+                sample_size = min(len(conversation_turns), len(type_turns))
+                if sample_size > 0:
+                    sample_metrics = type_turns.head(sample_size)
+            # Display each turn with its metrics
+            for i, turn in enumerate(conversation_turns):
+                role = turn.get('role', 'unknown')
+                content = turn.get('content', 'No content')
+                # Display turn content with role styling
+                if role == 'user':
+                    st.markdown(f"**👤 User (Turn {i+1}):**")
+                    st.info(content)
+                elif role == 'assistant':
+                    st.markdown(f"**🤖 Assistant (Turn {i+1}):**")
+                    st.success(content)
+                else:
+                    st.markdown(f"**❓ {role.title()} (Turn {i+1}):**")
+                    st.warning(content)
+                # Display metrics for this turn
+                if sample_metrics is not None and i < len(sample_metrics):
+                    turn_row = sample_metrics.iloc[i]
+                    # Create metrics display
+                    metrics_for_turn = {}
+                    for col in available_columns:
+                        metric_name = col.replace('turn.turn_metrics.', '')
+                        friendly_name = get_human_friendly_metric_name(metric_name)
+                        value = turn_row.get(col, 'N/A')
+                        if pd.notna(value) and isinstance(value, (int, float)):
+                            metrics_for_turn[friendly_name] = round(value, 3)
+                        else:
+                            metrics_for_turn[friendly_name] = 'N/A'
+                    # Add toxicity metrics if available
+                    toxicities = turn.get('toxicities', {})
+                    if toxicities:
+                        st.markdown("**🔍 Toxicity Scores:**")
+                        tox_cols = st.columns(4)
+                        tox_metrics = [
+                            ('toxicity', 'Overall Toxicity'),
+                            ('severe_toxicity', 'Severe Toxicity'),
+                            ('identity_attack', 'Identity Attack'),
+                            ('insult', 'Insult'),
+                            ('obscene', 'Obscene'),
+                            ('sexual_explicit', 'Sexual Explicit'),
+                            ('threat', 'Threat')
+                        ]
+                        for idx, (tox_key, tox_name) in enumerate(tox_metrics):
+                            if tox_key in toxicities:
+                                col_idx = idx % 4
+                                with tox_cols[col_idx]:
+                                    tox_value = toxicities[tox_key]
+                                    if isinstance(tox_value, (int, float)):
+                                        # Color code based on toxicity level
+                                        if tox_value > 0.5:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="HIGH", delta_color="inverse")
+                                        elif tox_value > 0.1:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="MED", delta_color="off")
+                                        else:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="LOW", delta_color="normal")
+                                    else:
+                                        st.metric(tox_name, str(tox_value))
+                    # Display complexity metrics
+                    if metrics_for_turn:
+                        st.markdown("**📊 Complexity Metrics:**")
+                        # Display metrics in columns
+                        num_cols = min(4, len(metrics_for_turn))
+                        if num_cols > 0:
+                            cols = st.columns(num_cols)
+                            for idx, (metric_name, value) in enumerate(metrics_for_turn.items()):
+                                col_idx = idx % num_cols
+                                with cols[col_idx]:
+                                    if isinstance(value, (int, float)) and value != 'N/A':
+                                        st.metric(metric_name, value)
                                     else:
+                                        st.metric(metric_name, str(value))
+                else:
+                    # Show toxicity even when no complexity metrics available
+                    toxicities = turn.get('toxicities', {})
+                    if toxicities:
+                        st.markdown("**🔍 Toxicity Scores:**")
+                        tox_cols = st.columns(4)
+                        tox_metrics = [
+                            ('toxicity', 'Overall Toxicity'),
+                            ('severe_toxicity', 'Severe Toxicity'),
+                            ('identity_attack', 'Identity Attack'),
+                            ('insult', 'Insult'),
+                            ('obscene', 'Obscene'),
+                            ('sexual_explicit', 'Sexual Explicit'),
+                            ('threat', 'Threat')
+                        ]
+                        for idx, (tox_key, tox_name) in enumerate(tox_metrics):
+                            if tox_key in toxicities:
+                                col_idx = idx % 4
+                                with tox_cols[col_idx]:
+                                    tox_value = toxicities[tox_key]
+                                    if isinstance(tox_value, (int, float)):
+                                        # Color code based on toxicity level
+                                        if tox_value > 0.5:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="HIGH", delta_color="inverse")
+                                        elif tox_value > 0.1:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="MED", delta_color="off")
+                                        else:
+                                            st.metric(tox_name, f"{tox_value:.4f}", delta="LOW", delta_color="normal")
                                     else:
+                                        st.metric(tox_name, str(tox_value))
+                    # Show basic turn statistics when no complexity metrics available
+                    st.markdown("**📈 Basic Statistics:**")
+                    col1, col2, col3 = st.columns(3)
+                    with col1:
+                        st.metric("Characters", len(content))
+                    with col2:
+                        st.metric("Words", len(content.split()))
+                    with col3:
+                        st.metric("Role", role.title())
+                # Add separator between turns
+                st.divider()
+            # Plot metrics over turns with real data if available
+            if available_columns and sample_metrics is not None:
+                st.subheader("📈 Metrics Over Turns")
+                fig = go.Figure()
+                # Add traces for each selected metric (real data)
+                for col in available_columns[:5]:  # Limit to first 5 for readability
+                    metric_name = col.replace('turn.turn_metrics.', '')
+                    friendly_name = get_human_friendly_metric_name(metric_name)
+                    # Get values for this metric
+                    y_values = []
+                    for _, turn_row in sample_metrics.iterrows():
+                        value = turn_row.get(col, None)
+                        if pd.notna(value) and isinstance(value, (int, float)):
+                            y_values.append(value)
                         else:
+                            y_values.append(None)
+                    if any(v is not None for v in y_values):
+                        fig.add_trace(go.Scatter(
+                            x=list(range(1, len(y_values) + 1)),
+                            y=y_values,
+                            mode='lines+markers',
+                            name=friendly_name,
+                            line=dict(width=2),
+                            marker=dict(size=8),
+                            connectgaps=False
+                        ))
+                if fig.data:  # Only show if we have data
+                    fig.update_layout(
+                        title="Complexity Metrics Across Conversation Turns",
+                        xaxis_title="Turn Number",
+                        yaxis_title="Metric Value",
+                        height=400,
+                        hovermode='x unified'
+                    )
+                    st.plotly_chart(fig, use_container_width=True)
+                else:
+                    st.info("No numeric metric data available to plot for this conversation type.")
+            elif selected_metrics:
+                st.info("Select metrics that are available in the dataset to see turn-level analysis.")
             else:
+                st.warning("Select some metrics to see detailed turn-level analysis.")
         else:
             st.warning("No conversation data available for the selected conversation.")