"""Step 4: Evaluation Results page for the Streamlit app."""

import json
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import streamlit as st


def _convert_to_json_serializable(obj: Any) -> Any:
    """Convert numpy/torch types to JSON-serializable Python types."""
    if isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: _convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [_convert_to_json_serializable(item) for item in obj]
    else:
        return obj
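

# Illustrative behavior (hypothetical values): a nested result row such as
# {"score": np.float32(0.82), "ranks": np.array([1, 2])} converts to
# {"score": 0.82, "ranks": [1, 2]}, which json.dumps can serialize directly.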


def _utterances_to_turns(utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Convert Step 2 parsed utterances to rubric scoring format.

    Heuristic: a turn gets role 'user' if its speaker name starts with 'user'
    (case-insensitive); otherwise it gets role 'assistant'.
    """
    turns = []
    for u in utterances:
        spk = str(u.get("speaker", "")).strip().lower()
        role = "user" if spk.startswith("user") else "assistant"
        turns.append({"role": role, "content": u.get("text", "")})
    return turns
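

# Illustrative mapping (hypothetical Step 2 data):
#   [{"speaker": "User 1", "text": "Hi"}, {"speaker": "Agent", "text": "Hello!"}]
# would become
#   [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]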


def render_step4() -> None:
    """Render Step 4: Evaluation Results page."""
    st.header("Step 4: Evaluation Results")
    st.markdown("View the evaluation results for your conversation.")

    if not st.session_state.get("conversation_uploaded"):
        st.warning("No conversation uploaded. Please go back to Step 2.")
        return

    utterances = st.session_state.get("utterances", [])
    selected_metrics = st.session_state.get("selected_metrics", [])

    # ===== A) Predefined evaluator results (left panel metrics via orchestrator) =====
    api_keys = {}
    if st.session_state.get("openai_configured") and st.session_state.get("openai_key"):
        api_keys["openai"] = st.session_state.openai_key
    if st.session_state.get("hf_configured") and st.session_state.get("hf_key"):
        api_keys["hf"] = st.session_state.hf_key

    if not selected_metrics:
        st.info(
            "No predefined metrics selected on Step 3 (left). Skipping orchestrator section."
        )
    else:
        st.subheader("A) Predefined Metrics (Evaluator Registry)")
        from services.orchestrator import ConversationOrchestrator

        orchestrator = ConversationOrchestrator(api_keys=api_keys)
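        # The orchestrator is assumed to dispatch each selected metric to its
        # registered evaluator, using api_keys for any hosted-model backends.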

        with st.spinner("Running evaluator registry…"):
            try:
                results = orchestrator.evaluate_conversation(
                    utterances, selected_metrics=selected_metrics
                )
                st.session_state.evaluation_results = results
            except Exception as e:
                st.error(f"Evaluator run failed: {e}")
                results = []

        if results:
            st.success(f"✅ Processed {len(results)} utterances")
            # summary cards
            metric_counts: Dict[str, int] = {}
            for row in results:
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_counts[metric_name] = (
                            metric_counts.get(metric_name, 0) + 1
                        )

            if metric_counts:
                from evaluators import get_metric_metadata

                cols = st.columns(min(len(metric_counts), 4))
                for i, (metric_name, count) in enumerate(metric_counts.items()):
                    md = get_metric_metadata(metric_name)
                    label = md.label if md else metric_name.replace("_", " ").title()
                    with cols[i % len(cols)]:
                        st.metric(label, f"{count} utterances")

            # detail table
            display_data = []
            for row in results:
                display_row = {
                    "Index": row["index"],
                    "Speaker": row["speaker"],
                    "Text": row["text"][:100]
                    + ("..." if len(row["text"]) > 100 else ""),
                }
                for metric_name in selected_metrics:
                    scores_key = f"{metric_name}_scores"
                    if scores_key in row and row[scores_key]:
                        metric_scores = row[scores_key]
                        # show only the first score entry as a representative cell value
                        cell = "-"
                        for _, sv in metric_scores.items():
                            t = sv.get("type")
                            if t == "categorical":
                                cell = f"{sv['label']} ({sv.get('confidence', 0):.2f})"
                            elif t == "numerical":
                                cell = f"{sv['value']:.2f}/{sv['max_value']}"
                            break
                        display_row[metric_name] = cell
                    else:
                        display_row[metric_name] = "-"
                display_data.append(display_row)

            df = pd.DataFrame(display_data)
            st.dataframe(df, use_container_width=True, hide_index=True)

            with st.expander("💬 Utterance-by-Utterance View"):
                from evaluators import get_metric_metadata

                for i, row in enumerate(results):
                    st.markdown(f"**Utterance {i+1}: {row['speaker']}**")
                    st.write(row["text"])
                    for metric_name in selected_metrics:
                        scores_key = f"{metric_name}_scores"
                        if scores_key in row and row[scores_key]:
                            md = get_metric_metadata(metric_name)
                            label = (
                                md.label
                                if md
                                else metric_name.replace("_", " ").title()
                            )
                            st.write(f"- **{label}:** {row[scores_key]}")

            # export
            col1, col2 = st.columns(2)
            with col1:
                # Convert results to JSON-serializable format
                serializable_results = _convert_to_json_serializable(results)
                st.download_button(
                    "📥 Download evaluator JSON",
                    json.dumps(serializable_results, indent=2),
                    "conversation_evaluation_results.json",
                    "application/json",
                    use_container_width=True,
                )
            with col2:
                st.download_button(
                    "📥 Download evaluator CSV",
                    df.to_csv(index=False),
                    "conversation_evaluation_results.csv",
                    "text/csv",
                    use_container_width=True,
                )

    st.divider()

    # ===== B) Custom refined metrics (right panel rubric) =====
    st.subheader("B) Custom Refined Metrics (Rubric Scoring)")
    refined_subset = st.session_state.get(
        "profile_refined_subset"
    ) or st.session_state.get("refined")
    if not refined_subset:
        st.info("No refined rubric found. Go back to Step 3 Right to refine & lock.")
        return

    # Convert utterances to {role, content}
    from core.workflow import pretty_metrics_output, score_conversation

    conv_turns = _utterances_to_turns(utterances)

    try:
        with st.spinner("Scoring with custom refined metrics…"):
            rubric_result = score_conversation(
                conv_turns, refined_subset, st.session_state.get("user_prefs", {})
            )
        st.code(pretty_metrics_output(rubric_result), language="text")
    except Exception as e:
        st.error(f"Rubric scoring failed: {e}")