"""Step 4: Evaluation Results page for the Streamlit app."""

import json
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import streamlit as st


def _convert_to_json_serializable(obj: Any) -> Any:
    """Convert numpy/torch types to JSON-serializable Python types."""
    if isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
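

# Illustrative behavior (hypothetical values): a nested result row such as
# {"score": np.float32(0.82), "ranks": np.array([1, 2])} converts to
# {"score": 0.82, "ranks": [1, 2]}, which json.dumps can serialize directly.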


def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to rubric scoring format.

    Heuristic: a turn gets role 'user' if its speaker name starts with 'user'
    (case-insensitive); otherwise it gets role 'assistant'.
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
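

# Illustrative mapping (hypothetical Step 2 data):
#   [{"speaker": "User 1", "text": "Hi"}, {"speaker": "Agent", "text": "Hello!"}]
# would become
#   [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]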


def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])

    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key

    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left). Skipping orchestrator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
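        # The orchestrator is assumed to dispatch each selected metric to its
        # registered evaluator, using api_keys for any hosted-model backends.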

        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []

        if results:
            st.success(f"✅ Processed {len(results)} utterances")
            # summary cards
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = (
                            metric_counts.get(metric_name, 0) + 1
                        )

            if metric_counts:
                from evaluators import get_metric_metadata

                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")

            # detail table
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100]
                    + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # show only the first score entry as a representative cell value
                        cell = "-"
                        for _, sv in metric_scores.items():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)

            with st.expander("💬 Utterance-by-Utterance View"):
                from evaluators import get_metric_metadata

                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i+1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            md = get_metric_metadata(metric_name)
                            label = (
                                md.label
                                if md
                                else metric_name.replace("_", " ").title()
                            )
                            st.write(f"- **{label}:** {row[scores_key]}")

            # export
            col1, col2 = st.columns(2)
            with col1:
                # Convert results to JSON-serializable format
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )

    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 Right to refine & lock.")
        return

    # Convert utterances to {role, content}
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)

    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
        st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")