# web/pages/step3_right.py
import json
from dataclasses import asdict

import streamlit as st

# Compatibility shim: older Streamlit releases only expose experimental_rerun.
if not hasattr(st, "rerun") and hasattr(st, "experimental_rerun"):
    st.rerun = st.experimental_rerun  # type: ignore[attr-defined]

from core.workflow import (
    BUILT_IN_EXAMPLES,
    available_dimensions,
    build_profile,
    default_user_prefs,
    extract_candidate_terms,
    filter_refined_metrics,  # <-- NEW
    load_definitions,
    lookup_definitions_for_terms,
    parse_conversation_text,
    pretty_conversation,
    pretty_metrics_output,
    pretty_refined,
    refine_metrics_once,
    sample_examples_for_dims,
    score_conversation,
    update_example_outputs,
    update_rubric_from_example_feedback,
)
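# Assumed contract for the new filter_refined_metrics helper (its source lives
# in core.workflow and is not shown here): given the refined-metrics object and
# a list of allowed names, it returns a copy whose .metrics keeps only entries
# with m.name in that list. A minimal sketch, assuming `refined` is a dataclass:
#
#     from dataclasses import replace
#
#     def filter_refined_metrics(refined, allowed_names):
#         allowed = set(allowed_names)
#         return replace(
#             refined, metrics=[m for m in refined.metrics if m.name in allowed]
#         )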
def _init_state():
    if "state" not in st.session_state:
        st.session_state.state = "await_metrics"
        st.session_state.raw_metrics = ""
        st.session_state.refined = None
        st.session_state.example_convos = None
        st.session_state.example_outputs = None
        st.session_state.profile = None
        st.session_state.user_prefs = default_user_prefs()
        # NEW: which refined metrics to proceed with (filled after lock)
        st.session_state.allowed_refined_metric_names = []
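
# The page is a small state machine driven by st.session_state.state:
#   await_metrics -> await_metrics_approval -> await_examples_choice
#     -> await_examples_approval -> ready_for_scoring
# Each branch in render_step3_right() draws one step and advances the state
# before calling st.rerun().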
def render_step3_right():
    _init_state()
    st.title("🧪 Conversational Mental Metrics (Streamlit)")

    with st.expander("How this works"):
        st.markdown(
            "Flow: paste rough metrics → refine → approve/feedback → provide examples "
            "→ approve → **choose metrics** → score conversations."
        )
    # === UI blocks ===
    if st.session_state.state == "await_metrics":
        st.subheader("1) Paste your rough metrics")
        raw = st.text_area(
            "Metrics (bullet list or text)",
            height=200,
            placeholder="- Empathy\n- Specificity\n- Safety\n- Actionability",
        )

        # Live preview of matched definitions (optional)
        if raw.strip():
            defs_store = load_definitions()
            terms = extract_candidate_terms(raw)
            matches = lookup_definitions_for_terms(terms, defs_store)
            if matches:
                with st.expander(
                    "Matched definitions to include in refinement", expanded=True
                ):
                    for k, v in matches.items():
                        st.markdown(f"- **{k}**: {v}")

        if st.button("Refine metrics"):
            st.session_state.raw_metrics = raw
            st.session_state.refined = refine_metrics_once(raw, feedback="")
            st.session_state.state = "await_metrics_approval"
            st.rerun()
elif st.session_state.state == "await_metrics_approval":
st.subheader("2) Review refined metrics")
st.code(pretty_refined(st.session_state.refined), language="text")
col1, col2 = st.columns(2)
with col1:
if st.button("Approve"):
st.session_state.state = "await_examples_choice"
st.rerun()
with col2:
fb = st.text_input(
"Or give feedback (will refine again). Prefix not needed."
)
if st.button("Apply feedback"):
st.session_state.refined = refine_metrics_once(
st.session_state.raw_metrics, feedback=fb
)
st.rerun()
    elif st.session_state.state == "await_examples_choice":
        st.subheader("3) Provide example conversations")
        use_builtin = st.checkbox("Use built-in examples", value=False)

        # NEW: choose dimensions for curated examples (if not using freeform JSON)
        dims = st.multiselect(
            "Pick dimensions to preview examples from",
            options=available_dimensions(),
            default=["empathy", "safety"],  # tweak the default if you like
        )
        max_per_dim = st.slider("Examples per selected dimension", 1, 2, 1)

        raw_examples = st.text_area(
            "OR paste JSON (a list of conversations, or a single conversation as a list of turns)",
            height=200,
            placeholder='[{"role":"user","content":"..."}, ...]',
        )

        if st.button("Score examples"):
            if use_builtin or (not raw_examples.strip() and dims):
                st.session_state.example_convos = sample_examples_for_dims(
                    dims, max_per_dim=max_per_dim
                )
            else:
                # Parse first, then validate; st.stop() raises internally, so
                # keep it outside the try/except that guards json.loads().
                try:
                    obj = json.loads(raw_examples)
                except json.JSONDecodeError as e:
                    st.error(f"JSON parse error: {e}")
                    st.stop()
                if not (isinstance(obj, list) and obj):
                    st.error("Provide a non-empty JSON list.")
                    st.stop()
                if all(isinstance(x, list) for x in obj):
                    # A list of conversations.
                    st.session_state.example_convos = obj
                elif all(
                    isinstance(t, dict) and "role" in t and "content" in t
                    for t in obj
                ):
                    # A single conversation given as a list of turns.
                    st.session_state.example_convos = [obj]
                else:
                    st.error(
                        "Could not parse. Provide a list of turns or a list of conversations."
                    )
                    st.stop()

            outs = []
            for conv in st.session_state.example_convos:
                mo = score_conversation(
                    conv, st.session_state.refined, st.session_state.user_prefs
                )
                outs.append({"conversation": conv, "metrics_output": mo})
            st.session_state.example_outputs = outs
            st.session_state.state = "await_examples_approval"
            st.rerun()
    elif st.session_state.state == "await_examples_approval":
        st.subheader("4) Review example scores")
        metric_filter = st.multiselect(
            "Filter displayed metrics (optional)",
            options=[m.name for m in st.session_state.refined.metrics],
            default=[],
        )
        for i, o in enumerate(st.session_state.example_outputs, 1):
            st.markdown(f"**Example {i} — Conversation**")
            st.code(pretty_conversation(o["conversation"]), language="text")
            st.markdown("**Metrics Output**")
            mo = o["metrics_output"]
            if metric_filter:
                # Shallow filter for display only; the stored output keeps its
                # full {"summary": ..., "metrics": {...}} shape.
                mo = {
                    "summary": mo.get("summary", ""),
                    "metrics": {
                        k: v
                        for k, v in mo.get("metrics", {}).items()
                        if k in metric_filter
                    },
                }
            st.code(pretty_metrics_output(mo), language="text")
        c1, c2 = st.columns(2)
        with c1:
            if st.button("Approve examples and lock profile"):
                st.session_state.profile = build_profile(
                    st.session_state.refined,
                    st.session_state.example_outputs,
                    st.session_state.user_prefs,
                )
                # NEW: seed the allowed metrics from the left panel's selection,
                # intersected with the refined metric names. If the left panel
                # used keys that differ from the refined names, keep it simple:
                # take the overlap by name, else fall back to all refined metrics.
                left_selected = set(st.session_state.get("selected_metrics", []))
                refined_names = [m.name for m in st.session_state.refined.metrics]
                overlap = [n for n in refined_names if n in left_selected]
                st.session_state.allowed_refined_metric_names = overlap or refined_names
                st.session_state.state = "ready_for_scoring"
                st.rerun()
        with c2:
            fb = st.text_input("Feedback to adjust rubric & outputs")
            if st.button("Apply feedback and rescore"):
                updated_outputs = update_example_outputs(
                    st.session_state.example_outputs, fb
                )
                new_refined, change_log = update_rubric_from_example_feedback(
                    refined=st.session_state.refined,
                    example_outputs=updated_outputs,
                    feedback=fb,
                )
                st.session_state.refined = new_refined

                rescored = []
                for conv in [item["conversation"] for item in updated_outputs]:
                    mo = score_conversation(
                        conv, st.session_state.refined, st.session_state.user_prefs
                    )
                    rescored.append({"conversation": conv, "metrics_output": mo})
                st.session_state.example_outputs = rescored

                if change_log:
                    st.info("Change log:\n- " + "\n- ".join(change_log))
                st.rerun()
    elif st.session_state.state == "ready_for_scoring":
        st.subheader("5) Choose metrics to proceed & score any conversation")

        # NEW: let the user choose which refined metrics are active.
        all_refined_names = [m.name for m in st.session_state.refined.metrics]
        current_allowed = st.session_state.get(
            "allowed_refined_metric_names", all_refined_names
        )
        chosen = st.multiselect(
            "Select which refined metrics to use for scoring",
            options=all_refined_names,
            default=current_allowed,
        )
        st.session_state.allowed_refined_metric_names = chosen or all_refined_names

        # (Optional) let the user also pick example *dimensions* so curated
        # examples can be reused later if they want.
        with st.expander(
            "(Optional) Choose example dimensions to preview more examples"
        ):
            dims = st.multiselect(
                "Dimensions",
                options=available_dimensions(),
                default=["empathy", "safety"],
            )
            st.caption(
                "This only affects example previews, not the scoring rubric. "
                "(Use the selector above to control scoring metrics.)"
            )

        sample = "User: ...\nAssistant: ...\nUser: ...\nAssistant: ..."
        conv_txt = st.text_area(
            "Paste JSON turns or simple transcript", height=220, placeholder=sample
        )
if st.button("Score conversation"):
conv = parse_conversation_text(conv_txt)
if not conv:
st.error("Could not parse conversation.")
else:
# NEW: score with filtered refined metrics
filtered = filter_refined_metrics(
st.session_state.refined,
st.session_state.allowed_refined_metric_names,
)
result = score_conversation(conv, filtered, st.session_state.user_prefs)
st.code(pretty_metrics_output(result), language="text")
if st.button("Reset workflow"):
for k in list(st.session_state.keys()):
del st.session_state[k]
st.rerun()
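
# Assumption: a parent multipage app imports render_step3_right() and calls it
# within its own layout (the "right" in the filename suggests a two-column
# page). For quick standalone testing, the page can also be run directly
# (streamlit executes the script with __name__ == "__main__"):
if __name__ == "__main__":
    render_step3_right()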