"""Step 4: Evaluation Results page for the Streamlit app.""" import json from typing import Any, Dict, List import numpy as np import pandas as pd import streamlit as st def _convert_to_json_serializable(obj: Any) -> Any: """Convert numpy/torch types to JSON-serializable Python types.""" if isinstance(obj, (np.integer, np.int32, np.int64)): return int(obj) elif isinstance(obj, (np.floating, np.float32, np.float64)): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, dict): return {k: _convert_to_json_serializable(v) for k, v in obj.items()} elif isinstance(obj, (list, tuple)): return [_convert_to_json_serializable(item) for item in obj] else: return obj def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]: """Convert Step 2 parsed utterances to rubric scoring format. Heuristic: 'user' if speaker startswith 'user' (case-insensitive); otherwise 'assistant'. """ turns = [] for u in utterances: spk = str(u.get("speaker", "")).strip().lower() role = "user" if spk.startswith("user") else "assistant" turns.append({"role": role, "content": u.get("text", "")}) return turns def render_step4() -> None: """Render Step 4: Evaluation Results page.""" st.header("Step 4: Evaluation Results") st.markdown("View the evaluation results for your conversation.") if not st.session_state.get("conversation_uploaded"): st.warning("No conversation uploaded. Please go back to Step 2.") return utterances = st.session_state.get("utterances", []) selected_metrics = st.session_state.get("selected_metrics", []) # ===== A) Predefined evaluator results (left panel metrics via orchestrator) ===== api_keys = {} if st.session_state.get("openai_configured") and st.session_state.get("openai_key"): api_keys["openai"] = st.session_state.openai_key if st.session_state.get("hf_configured") and st.session_state.get("hf_key"): api_keys["hf"] = st.session_state.hf_key if not selected_metrics: st.info( "No predefined metrics selected on Step 3 (left). Skipping orchestrator section." ) else: st.subheader("A) Predefined Metrics (Evaluator Registry)") from services.orchestrator import ConversationOrchestrator orchestrator = ConversationOrchestrator(api_keys=api_keys) with st.spinner("Running evaluator registry…"): try: results = orchestrator.evaluate_conversation( utterances, selected_metrics=selected_metrics ) st.session_state.evaluation_results = results except Exception as e: st.error(f"Evaluator run failed: {e}") results = [] if results: st.success(f"✅ Processed {len(results)} utterances") # summary cards metric_counts: Dict[str, int] = {} for row in results: for metric_name in selected_metrics: scores_key = f"{metric_name}_scores" if scores_key in row and row[scores_key]: metric_counts[metric_name] = ( metric_counts.get(metric_name, 0) + 1 ) if metric_counts: cols = st.columns(min(len(metric_counts), 4)) for i, (metric_name, count) in enumerate(metric_counts.items()): from evaluators import get_metric_metadata md = get_metric_metadata(metric_name) label = md.label if md else metric_name.replace("_", " ").title() with cols[i % len(cols)]: st.metric(label, f"{count} utterances") # detail table display_data = [] for row in results: display_row = { "Index": row["index"], "Speaker": row["speaker"], "Text": row["text"][:100] + ("..." if len(row["text"]) > 100 else ""), } for metric_name in selected_metrics: scores_key = f"{metric_name}_scores" if scores_key in row and row[scores_key]: metric_scores = row[scores_key] # take one representative score cell = "-" for _, sv in metric_scores.items(): t = sv.get("type") if t == "categorical": cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})" elif t == "numerical": cell = f"{sv['value']:.2f}/{sv['max_value']}" break display_row[metric_name] = cell else: display_row[metric_name] = "-" display_data.append(display_row) df = pd.DataFrame(display_data) st.dataframe(df, use_container_width=True, hide_index=True) with st.expander("💬 Utterance-by-Utterance View"): for i, row in enumerate(results): st.markdown(f"**Utterance {i+1}: {row['speaker']}**") st.write(row["text"]) for metric_name in selected_metrics: scores_key = f"{metric_name}_scores" if scores_key in row and row[scores_key]: from evaluators import get_metric_metadata md = get_metric_metadata(metric_name) label = ( md.label if md else metric_name.replace("_", " ").title() ) st.write(f"- **{label}:** {row[scores_key]}") # export col1, col2 = st.columns(2) with col1: # Convert results to JSON-serializable format serializable_results = _convert_to_json_serializable(results) st.download_button( "📥 Download evaluator JSON", json.dumps(serializable_results, indent=2), "conversation_evaluation_results.json", "application/json", use_container_width=True, ) with col2: st.download_button( "📥 Download evaluator CSV", df.to_csv(index=False), "conversation_evaluation_results.csv", "text/csv", use_container_width=True, ) st.divider() # ===== B) Custom refined metrics (right panel rubric) ===== st.subheader("B) Custom Refined Metrics (Rubric Scoring)") refined_subset = st.session_state.get( "profile_refined_subset" ) or st.session_state.get("refined") if not refined_subset: st.info("No refined rubric found. Go back to Step 3 Right to refine & lock.") return # Convert utterances to {role, content} from core.workflow import pretty_metrics_output, score_conversation conv_turns = _utterances_to_turns(utterances) try: with st.spinner("Scoring with custom refined metrics…"): rubric_result = score_conversation( conv_turns, refined_subset, st.session_state.get("user_prefs", {}) ) st.code(pretty_metrics_output(rubric_result), language="text") except Exception as e: st.error(f"Rubric scoring failed: {e}")