"""Step 4: Evaluation Results page for the Streamlit app."""
import json
from typing import Any, Dict, List
import numpy as np
import pandas as pd
import streamlit as st


def _convert_to_json_serializable(obj: Any) -> Any:
    """Recursively convert numpy types to JSON-serializable Python types."""
    # np.int32/np.int64 are subclasses of np.integer, and np.float32/np.float64
    # of np.floating, so checking the abstract base types covers all of them.
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
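
# Illustrative usage (hypothetical values, not from the app's data): numpy
# scalars and arrays become plain Python types that json.dumps accepts.
#     _convert_to_json_serializable({"score": np.float32(0.75), "ids": np.array([1, 2])})
#     # -> {"score": 0.75, "ids": [1, 2]}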


def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to the rubric scoring format.

    Heuristic: a turn is 'user' if the speaker name starts with 'user'
    (case-insensitive); otherwise it is 'assistant'.
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
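
# Illustrative usage (hypothetical speaker labels): the heuristic maps any
# speaker whose name starts with "user" to the user role.
#     _utterances_to_turns([
#         {"speaker": "User 1", "text": "Hi"},
#         {"speaker": "Agent", "text": "Hello!"},
#     ])
#     # -> [{"role": "user", "content": "Hi"},
#     #     {"role": "assistant", "content": "Hello!"}]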


def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])

    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
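    # Expected orchestrator row shape, inferred from the rendering code below
    # (field names as used there; this is a sketch, not an authoritative schema):
    #     {"index": int, "speaker": str, "text": str,
    #      "<metric>_scores": {evaluator_name: {"type": "categorical" | "numerical", ...}}}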
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key

    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left). Skipping the orchestrator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []

        if results:
            st.success(f"✅ Processed {len(results)} utterances")
            from evaluators import get_metric_metadata

            # Summary cards: one card per metric with the count of scored utterances.
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1

            if metric_counts:
                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")

            # Detail table: one row per utterance, one column per selected metric.
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100] + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # Take the first score as a representative cell value.
                        cell = "-"
                        for sv in metric_scores.values():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)

            with st.expander("💬 Utterance-by-Utterance View"):
                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i + 1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            md = get_metric_metadata(metric_name)
                            label = (
                                md.label
                                if md
                                else metric_name.replace("_", " ").title()
                            )
                            st.write(f"- **{label}:** {row[scores_key]}")

            # Export: full results as JSON, display table as CSV.
            col1, col2 = st.columns(2)
            with col1:
                # Convert numpy types to plain Python before json.dumps.
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )

    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 (right) to refine and lock.")
        return

    # Convert utterances to {role, content} turns for the rubric scorer.
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)

    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
            st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")