# web/pages/step3_right.py
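"""Right-hand page for the conversational metrics wizard.

Flow: paste rough metrics -> refine -> approve/feedback -> provide examples
-> approve -> choose metrics -> score conversations.
"""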
import json

import streamlit as st

# Compatibility shim: older Streamlit releases only ship
# st.experimental_rerun, so alias it onto st.rerun.
if not hasattr(st, "rerun") and hasattr(st, "experimental_rerun"):
    st.rerun = st.experimental_rerun  # type: ignore[attr-defined]

from core.workflow import (
    BUILT_IN_EXAMPLES,
    available_dimensions,
    build_profile,
    default_user_prefs,
    extract_candidate_terms,
    filter_refined_metrics,  # NEW
    load_definitions,
    lookup_definitions_for_terms,
    parse_conversation_text,
    pretty_conversation,
    pretty_metrics_output,
    pretty_refined,
    refine_metrics_once,
    sample_examples_for_dims,
    score_conversation,
    update_example_outputs,
    update_rubric_from_example_feedback,
)


def _init_state():
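    """Seed st.session_state with the wizard's defaults on first load."""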
| if "state" not in st.session_state: | |
| st.session_state.state = "await_metrics" | |
| st.session_state.raw_metrics = "" | |
| st.session_state.refined = None | |
| st.session_state.example_convos = None | |
| st.session_state.example_outputs = None | |
| st.session_state.profile = None | |
| st.session_state.user_prefs = default_user_prefs() | |
| # NEW: which refined metrics to proceed with (filled after lock) | |
| st.session_state.allowed_refined_metric_names = [] | |
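

# The page is a small state machine; st.session_state.state moves through:
#   await_metrics -> await_metrics_approval -> await_examples_choice
#   -> await_examples_approval -> ready_for_scoring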
def render_step3_right():
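    """Render the step-3 wizard: refine metrics, review examples, score."""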
    _init_state()
    st.title("🧪 Conversational Mental Metrics (Streamlit)")
    with st.expander("How this works"):
        st.markdown(
            "Flow: paste rough metrics → refine → approve/feedback → "
            "provide examples → approve → **choose metrics** → score conversations."
        )

    # === UI blocks ===
    if st.session_state.state == "await_metrics":
        st.subheader("1) Paste your rough metrics")
        raw = st.text_area(
            "Metrics (bullet list or text)",
            height=200,
            placeholder="- Empathy\n- Specificity\n- Safety\n- Actionability",
        )
        # Live preview of matched definitions (optional)
        if raw.strip():
            defs_store = load_definitions()
            terms = extract_candidate_terms(raw)
            matches = lookup_definitions_for_terms(terms, defs_store)
            if matches:
                with st.expander(
                    "Matched definitions to include in refinement", expanded=True
                ):
                    for k, v in matches.items():
                        st.markdown(f"- **{k}**: {v}")
        if st.button("Refine metrics"):
            st.session_state.raw_metrics = raw
            st.session_state.refined = refine_metrics_once(raw, feedback="")
            st.session_state.state = "await_metrics_approval"
            st.rerun()
| elif st.session_state.state == "await_metrics_approval": | |
| st.subheader("2) Review refined metrics") | |
| st.code(pretty_refined(st.session_state.refined), language="text") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if st.button("Approve"): | |
| st.session_state.state = "await_examples_choice" | |
| st.rerun() | |
| with col2: | |
| fb = st.text_input( | |
| "Or give feedback (will refine again). Prefix not needed." | |
| ) | |
| if st.button("Apply feedback"): | |
| st.session_state.refined = refine_metrics_once( | |
| st.session_state.raw_metrics, feedback=fb | |
| ) | |
| st.rerun() | |
| elif st.session_state.state == "await_examples_choice": | |
| st.subheader("3) Provide example conversations") | |
| use_builtin = st.checkbox("Use built-in examples", value=False) | |
| # NEW: choose dimensions for curated examples (if not using freeform JSON) | |
| dims = st.multiselect( | |
| "Pick dimensions to preview examples from", | |
| options=available_dimensions(), | |
| default=["empathy", "safety"], # tweak default if you like | |
| ) | |
| max_per_dim = st.slider("Examples per selected dimension", 1, 2, 1) | |
| raw_examples = st.text_area( | |
| "OR paste JSON (list of conversations or single conversation as list of turns)", | |
| height=200, | |
| placeholder='[{"role":"user","content":"..."}, ...]', | |
| ) | |
| if st.button("Score examples"): | |
| if use_builtin or (not raw_examples.strip() and dims): | |
| st.session_state.example_convos = sample_examples_for_dims( | |
| dims, max_per_dim=max_per_dim | |
| ) | |
| else: | |
| import json | |
| try: | |
| obj = json.loads(raw_examples) | |
| if isinstance(obj, list) and len(obj) > 0: | |
| if all(isinstance(x, list) for x in obj): | |
| st.session_state.example_convos = obj | |
| elif all( | |
| isinstance(t, dict) and "role" in t and "content" in t | |
| for t in obj | |
| ): | |
| st.session_state.example_convos = [obj] | |
| else: | |
| st.error( | |
| "Could not parse. Provide list of turns or list of conversations." | |
| ) | |
| st.stop() | |
| else: | |
| st.error("Invalid JSON.") | |
| st.stop() | |
| except Exception as e: | |
| st.error(f"JSON parse error: {e}") | |
| st.stop() | |
| outs = [] | |
| for conv in st.session_state.example_convos: | |
| mo = score_conversation( | |
| conv, st.session_state.refined, st.session_state.user_prefs | |
| ) | |
| outs.append({"conversation": conv, "metrics_output": mo}) | |
| st.session_state.example_outputs = outs | |
| st.session_state.state = "await_examples_approval" | |
| st.rerun() | |
| elif st.session_state.state == "await_examples_approval": | |
| st.subheader("4) Review example scores") | |
| metric_filter = st.multiselect( | |
| "Filter displayed metrics (optional)", | |
| options=[m.name for m in st.session_state.refined.metrics], | |
| default=[], | |
| ) | |
| for i, o in enumerate(st.session_state.example_outputs, 1): | |
| st.markdown(f"**Example {i} — Conversation**") | |
| st.code(pretty_conversation(o["conversation"]), language="text") | |
| st.markdown("**Metrics Output**") | |
| mo = o["metrics_output"] | |
| if metric_filter: | |
| # shallow filter for display only | |
| mo = { | |
| "summary": mo.get("summary", ""), | |
| "metrics": { | |
| k: v | |
| for k, v in mo.get("metrics", {}).items() | |
| if k in metric_filter | |
| }, | |
| } | |
| st.code(pretty_metrics_output(o["metrics_output"]), language="text") | |
        c1, c2 = st.columns(2)
        with c1:
            if st.button("Approve examples and lock profile"):
                st.session_state.profile = build_profile(
                    st.session_state.refined,
                    st.session_state.example_outputs,
                    st.session_state.user_prefs,
                )
                # NEW: seed the allowed metrics from the left panel's selection,
                # intersected with the refined metric names.
                left_selected = set(st.session_state.get("selected_metrics", []))
                refined_names = [m.name for m in st.session_state.refined.metrics]
                # If the left panel used keys that differ from the refined names,
                # keep it simple: take the overlap by name, else fall back to
                # all refined metrics.
                overlap = [n for n in refined_names if n in left_selected]
                st.session_state.allowed_refined_metric_names = overlap or refined_names
                st.session_state.state = "ready_for_scoring"
                st.rerun()
        with c2:
            fb = st.text_input("Feedback to adjust rubric & outputs")
            if st.button("Apply feedback and rescore"):
                updated_outputs = update_example_outputs(
                    st.session_state.example_outputs, fb
                )
                new_refined, change_log = update_rubric_from_example_feedback(
                    refined=st.session_state.refined,
                    example_outputs=updated_outputs,
                    feedback=fb,
                )
                st.session_state.refined = new_refined
                rescored = []
                for conv in [item["conversation"] for item in updated_outputs]:
                    mo = score_conversation(
                        conv, st.session_state.refined, st.session_state.user_prefs
                    )
                    rescored.append({"conversation": conv, "metrics_output": mo})
                st.session_state.example_outputs = rescored
                if change_log:
                    # Stash the change log: st.info rendered right before
                    # st.rerun() would be wiped by the rerun.
                    st.session_state.change_log_msg = (
                        "Change log:\n- " + "\n- ".join(change_log)
                    )
                st.rerun()
| elif st.session_state.state == "ready_for_scoring": | |
| st.subheader("5) Choose metrics to proceed & score any conversation") | |
| # NEW: let the user choose which refined metrics are active | |
| all_refined_names = [m.name for m in st.session_state.refined.metrics] | |
| current_allowed = st.session_state.get( | |
| "allowed_refined_metric_names", all_refined_names | |
| ) | |
| chosen = st.multiselect( | |
| "Select which refined metrics to use for scoring", | |
| options=all_refined_names, | |
| default=current_allowed, | |
| ) | |
| st.session_state.allowed_refined_metric_names = chosen or all_refined_names | |
| # (Optional) let the user also pick example *dimensions* to reuse curated examples later if they want | |
| with st.expander( | |
| "(Optional) Choose example dimensions to preview more examples" | |
| ): | |
| dims = st.multiselect( | |
| "Dimensions", | |
| options=available_dimensions(), | |
| default=["empathy", "safety"], | |
| ) | |
| st.caption( | |
| "This only affects example previews, not the scoring rubric. (Use the selector above to control scoring metrics.)" | |
| ) | |
| sample = "User: ...\nAssistant: ...\nUser: ...\nAssistant: ..." | |
| conv_txt = st.text_area( | |
| "Paste JSON turns or simple transcript", height=220, placeholder=sample | |
| ) | |
| if st.button("Score conversation"): | |
| conv = parse_conversation_text(conv_txt) | |
| if not conv: | |
| st.error("Could not parse conversation.") | |
| else: | |
| # NEW: score with filtered refined metrics | |
| filtered = filter_refined_metrics( | |
| st.session_state.refined, | |
| st.session_state.allowed_refined_metric_names, | |
| ) | |
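                # Assumed contract for filter_refined_metrics (defined in
                # core.workflow, not shown here): it returns a copy of the
                # refined rubric restricted to the named metrics, so
                # score_conversation only evaluates the selected ones.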
                result = score_conversation(conv, filtered, st.session_state.user_prefs)
                st.code(pretty_metrics_output(result), language="text")
| if st.button("Reset workflow"): | |
| for k in list(st.session_state.keys()): | |
| del st.session_state[k] | |
| st.rerun() | |
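
# NOTE (assumption): render_step3_right() is expected to be called by the
# surrounding page layout, which also renders the left panel that populates
# st.session_state["selected_metrics"] used when locking the profile above.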