"""Step 4: Evaluation Results page for the Streamlit app."""
import json
from typing import Any, Dict, List
import numpy as np
import pandas as pd
import streamlit as st


def _convert_to_json_serializable(obj: Any) -> Any:
    """Recursively convert numpy types to JSON-serializable Python types."""
    # np.int32/np.int64 are subclasses of np.integer, and np.float32/np.float64
    # of np.floating, so checking the abstract base types covers all of them.
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
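
# Illustrative usage (hypothetical values, not from the app's data): numpy
# scalars and arrays become plain Python types that json.dumps accepts.
#     _convert_to_json_serializable({"score": np.float32(0.75), "ids": np.array([1, 2])})
#     # -> {"score": 0.75, "ids": [1, 2]}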


def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to the rubric scoring format.

    Heuristic: a turn is 'user' if the speaker name starts with 'user'
    (case-insensitive); otherwise it is 'assistant'.
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
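
# Illustrative usage (hypothetical speaker labels): the heuristic maps any
# speaker whose name starts with "user" to the user role.
#     _utterances_to_turns([
#         {"speaker": "User 1", "text": "Hi"},
#         {"speaker": "Agent", "text": "Hello!"},
#     ])
#     # -> [{"role": "user", "content": "Hi"},
#     #     {"role": "assistant", "content": "Hello!"}]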


def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])

    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
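    # Expected orchestrator row shape, inferred from the rendering code below
    # (field names as used there; this is a sketch, not an authoritative schema):
    #     {"index": int, "speaker": str, "text": str,
    #      "<metric>_scores": {evaluator_name: {"type": "categorical" | "numerical", ...}}}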
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key

    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left). Skipping the orchestrator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []

        if results:
            st.success(f"✅ Processed {len(results)} utterances")
            from evaluators import get_metric_metadata

            # Summary cards: one card per metric with the count of scored utterances.
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1

            if metric_counts:
                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")

            # Detail table: one row per utterance, one column per selected metric.
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100] + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # Take the first score as a representative cell value.
                        cell = "-"
                        for sv in metric_scores.values():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)

            with st.expander("💬 Utterance-by-Utterance View"):
                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i + 1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            md = get_metric_metadata(metric_name)
                            label = (
                                md.label
                                if md
                                else metric_name.replace("_", " ").title()
                            )
                            st.write(f"- **{label}:** {row[scores_key]}")

            # Export: full results as JSON, display table as CSV.
            col1, col2 = st.columns(2)
            with col1:
                # Convert numpy types to plain Python before json.dumps.
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )

    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 (right) to refine and lock.")
        return

    # Convert utterances to {role, content} turns for the rubric scorer.
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)

    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
            st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")