"""Step 4: Evaluation Results page for the Streamlit app."""

import json
from typing import Any, Dict, List
import numpy as np
import pandas as pd
import streamlit as st

def _convert_to_json_serializable(obj: Any) -> Any:
    """Recursively convert numpy types to JSON-serializable Python types."""
    if isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
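
# Illustrative example (assumed input shape, not executed by the app): numpy
# scalars and arrays become plain Python values, so json.dumps no longer raises
# TypeError on evaluator output, e.g.
#   _convert_to_json_serializable({"score": np.float32(0.87), "ids": np.array([1, 2])})
#   -> roughly {"score": 0.87, "ids": [1, 2]}  (float32 round-trip may widen digits)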

def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to the rubric scoring format.

    Heuristic: role is "user" if the speaker name starts with "user"
    (case-insensitive); otherwise "assistant".
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
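
# Illustrative mapping (assumed Step 2 output shape): a speaker such as "User 1"
# becomes role "user"; any other speaker ("Agent", "Bot", ...) maps to "assistant":
#   _utterances_to_turns([{"speaker": "User 1", "text": "hi"}])
#   -> [{"role": "user", "content": "hi"}]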

def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])

    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key

    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left panel). Skipping the evaluator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")
        # Deferred import: evaluator dependencies load only when metrics were selected.
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []

        if results:
            st.success(f"✅ Processed {len(results)} utterances")

            # Summary cards: count utterances that received scores for each metric.
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1

            if metric_counts:
                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    from evaluators import get_metric_metadata

                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")

            # Detail table: one row per utterance, one representative score per metric.
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100] + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # Take the first score as the representative table cell.
                        cell = "-"
                        for _, sv in metric_scores.items():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)

            with st.expander("💬 Utterance-by-Utterance View"):
                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i + 1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            from evaluators import get_metric_metadata

                            md = get_metric_metadata(metric_name)
                            label = md.label if md else metric_name.replace("_", " ").title()
                            st.write(f"- **{label}:** {row[scores_key]}")

            # Export buttons for the evaluator results.
            col1, col2 = st.columns(2)
            with col1:
                # Convert numpy types so json.dumps does not raise TypeError.
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )

    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 (right panel) to refine and lock metrics.")
        return

    # Convert utterances to the {role, content} turn format.
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)
    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
        st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")