| """Step 4: Evaluation Results page for the Streamlit app.""" | |
| import json | |
| from typing import Any, Dict, List | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |


def _convert_to_json_serializable(obj: Any) -> Any:
    """Recursively convert numpy types to JSON-serializable Python types.

    Containers (dict/list/tuple) are walked recursively; anything that is not
    a numpy scalar or array is returned unchanged (torch tensors, in
    particular, are not converted here).
    """
    # np.int32/np.int64 and np.float32/np.float64 are subclasses of
    # np.integer/np.floating, so the abstract base classes cover them.
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
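
# Illustrative example (values chosen so the float repr is exact):
#   _convert_to_json_serializable({"score": np.float32(0.5), "ids": np.array([1, 2])})
#   -> {"score": 0.5, "ids": [1, 2]}
# json.dumps accepts the converted dict, whereas the raw numpy values would
# raise TypeError.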


def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to the rubric scoring format.

    Heuristic: the role is 'user' if the speaker name starts with 'user'
    (case-insensitive); otherwise it is 'assistant'.
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
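
# Example:
#   [{"speaker": "User 1", "text": "hi"}, {"speaker": "Agent", "text": "hey"}]
#   -> [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hey"}]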


def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])
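
    # Session-state contract (inferred from usage in this page, not a spec):
    # Step 2 sets conversation_uploaded and utterances; Step 3 sets
    # selected_metrics (left panel) and profile_refined_subset / refined
    # (right panel); the openai_* and hf_* keys come from wherever API
    # credentials are configured earlier in the app.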
    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key
    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left panel). "
            "Skipping the orchestrator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")

        # Imported lazily so the page still loads when section A is skipped.
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []
        if results:
            st.success(f"✅ Processed {len(results)} utterances")

            # summary cards: count utterances that received scores per metric
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1

            if metric_counts:
                from evaluators import get_metric_metadata

                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")
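
            # Shape of each result row, inferred from the accesses below
            # (not a documented schema):
            #   {"index": 0, "speaker": "User 1", "text": "...",
            #    "<metric>_scores": {"<scorer>":
            #        {"type": "categorical", "label": "...", "confidence": 0.9}
            #        or {"type": "numerical", "value": 4.0, "max_value": 5}}}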
            # detail table
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100] + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # take one representative score: only the first entry is
                        # inspected, and the cell stays "-" if its type is unknown
                        cell = "-"
                        for sv in metric_scores.values():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)
            with st.expander("💬 Utterance-by-Utterance View"):
                from evaluators import get_metric_metadata

                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i + 1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            md = get_metric_metadata(metric_name)
                            label = (
                                md.label
                                if md
                                else metric_name.replace("_", " ").title()
                            )
                            st.write(f"- **{label}:** {row[scores_key]}")
            # export
            col1, col2 = st.columns(2)
            with col1:
                # Convert numpy values so json.dumps does not raise TypeError
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )
    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 (right panel) to refine & lock.")
        return
    # Convert utterances to {role, content} turns for the rubric scorer
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)
    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
            st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")