# Source: LLMEvaluation / app.py
# Uploaded by Danielfonseca1212 ("Create app.py", commit d92f0a9, verified)
# app.py — LLM Evaluation Dashboard | G-Eval Style | Daniel Fonseca
import html
import math
import os

import streamlit as st
# Browser-tab title/icon plus the wide layout and expanded sidebar.
_PAGE_SETTINGS = {
    "page_title": "LLM Eval Β· Daniel Fonseca",
    "page_icon": "πŸ”¬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# ── CSS: scientific-laboratory theme ──────────────────────────
# Inject the app-wide stylesheet as a raw <style> element (hence
# unsafe_allow_html=True). The sheet:
#   * loads the DM Mono / Playfair Display / DM Sans web fonts,
#   * declares the colour palette as CSS custom properties on :root,
#   * hides Streamlit's #MainMenu, footer and header,
#   * defines the custom classes used by the HTML snippets rendered further
#     down (eval-header, gauge, dim-card, score-chip, prog-track/-fill,
#     verdict-banner, compare-row, form-label),
#   * restyles the sidebar, text inputs, buttons and tab buttons.
# The *-excellent / *-good / *-fair / *-poor class suffixes match the tier
# names returned by score_tier().
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:ital,wght@0,400;0,500;1,400&family=Playfair+Display:wght@400;700;900&family=DM+Sans:wght@300;400;500&display=swap');
:root {
--bg: #f5f4f0;
--bg2: #eeecea;
--bg3: #e6e4e0;
--ink: #1a1814;
--ink2: #4a4840;
--ink3: #8a8880;
--border: #d8d6d0;
--border2: #c8c6c0;
--green: #1a7a4a;
--green-bg: #e8f5ee;
--amber: #a06010;
--amber-bg: #fef3dc;
--red: #b02020;
--red-bg: #fdecea;
--blue: #1a4a8a;
--blue-bg: #e8eef8;
--accent: #2d5a9e;
}
html, body, [class*="css"] {
background: var(--bg) !important;
color: var(--ink) !important;
font-family: 'DM Sans', sans-serif;
}
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1.2rem; max-width: 1280px; }
/* ── HEADER ── */
.eval-header {
border-bottom: 2px solid var(--ink);
padding-bottom: 1rem;
margin-bottom: 1.5rem;
display: flex;
align-items: flex-end;
justify-content: space-between;
}
.eval-title {
font-family: 'Playfair Display', Georgia, serif;
font-weight: 900;
font-size: 2.2rem;
color: var(--ink);
letter-spacing: -0.02em;
line-height: 1;
}
.eval-subtitle {
font-family: 'DM Mono', monospace;
font-size: 0.65rem;
color: var(--ink3);
letter-spacing: 0.18em;
text-transform: uppercase;
margin-top: 0.3rem;
}
.version-tag {
font-family: 'DM Mono', monospace;
font-size: 0.65rem;
color: var(--ink3);
border: 1px solid var(--border2);
padding: 0.2rem 0.6rem;
border-radius: 2px;
letter-spacing: 0.1em;
}
/* ── GAUGE / SCORE RING ── */
.gauge-container {
text-align: center;
position: relative;
}
.gauge-ring-svg { display: block; margin: 0 auto; }
.gauge-value {
font-family: 'Playfair Display', serif;
font-weight: 900;
font-size: 2.4rem;
color: var(--ink);
line-height: 1;
}
.gauge-label {
font-family: 'DM Mono', monospace;
font-size: 0.6rem;
color: var(--ink3);
letter-spacing: 0.15em;
text-transform: uppercase;
margin-top: 0.2rem;
}
/* ── DIMENSION CARD ── */
.dim-card {
background: white;
border: 1px solid var(--border);
border-radius: 4px;
padding: 1rem 1.2rem;
margin-bottom: 0.6rem;
position: relative;
overflow: hidden;
}
.dim-card::before {
content: '';
position: absolute;
left: 0; top: 0; bottom: 0;
width: 3px;
}
.dim-excellent::before { background: var(--green); }
.dim-good::before { background: #5aaa70; }
.dim-fair::before { background: var(--amber); }
.dim-poor::before { background: var(--red); }
.dim-header {
display: flex;
justify-content: space-between;
align-items: flex-start;
margin-bottom: 0.5rem;
}
.dim-name {
font-family: 'DM Mono', monospace;
font-size: 0.72rem;
font-weight: 500;
color: var(--ink2);
text-transform: uppercase;
letter-spacing: 0.1em;
}
.dim-desc {
font-size: 0.8rem;
color: var(--ink3);
margin-top: 0.1rem;
}
.score-chip {
font-family: 'Playfair Display', serif;
font-weight: 700;
font-size: 1.3rem;
line-height: 1;
}
.score-excellent { color: var(--green); }
.score-good { color: #3a8a50; }
.score-fair { color: var(--amber); }
.score-poor { color: var(--red); }
/* Progress bar */
.prog-track {
background: var(--bg3);
border-radius: 2px;
height: 5px;
width: 100%;
margin: 0.5rem 0;
overflow: hidden;
}
.prog-fill-excellent { height: 100%; background: var(--green); border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-good { height: 100%; background: #3a8a50; border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-fair { height: 100%; background: var(--amber); border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-poor { height: 100%; background: var(--red); border-radius: 2px; transition: width 0.6s ease; }
.dim-reasoning {
font-size: 0.85rem;
color: var(--ink2);
line-height: 1.6;
font-style: italic;
margin-top: 0.4rem;
}
.issue-list {
margin-top: 0.4rem;
padding: 0;
list-style: none;
}
.issue-item {
font-family: 'DM Mono', monospace;
font-size: 0.72rem;
color: var(--red);
padding: 0.15rem 0;
display: flex;
align-items: flex-start;
gap: 0.4rem;
}
.issue-item::before { content: '↳'; color: var(--border2); }
/* ── VERDICT BANNER ── */
.verdict-banner {
border: 1px solid var(--border);
border-radius: 4px;
padding: 1.2rem 1.5rem;
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 1.5rem;
}
.verdict-excellent { background: var(--green-bg); border-color: var(--green); }
.verdict-good { background: #eef8f2; border-color: #3a8a50; }
.verdict-fair { background: var(--amber-bg); border-color: var(--amber); }
.verdict-poor { background: var(--red-bg); border-color: var(--red); }
.verdict-label {
font-family: 'Playfair Display', serif;
font-weight: 900;
font-size: 1.8rem;
letter-spacing: -0.02em;
}
.verdict-excellent .verdict-label { color: var(--green); }
.verdict-good .verdict-label { color: #3a8a50; }
.verdict-fair .verdict-label { color: var(--amber); }
.verdict-poor .verdict-label { color: var(--red); }
.verdict-summary {
font-size: 0.9rem;
color: var(--ink2);
line-height: 1.6;
}
/* ── COMPARISON TABLE ── */
.compare-row {
display: grid;
grid-template-columns: 160px 1fr 1fr;
gap: 0;
border-bottom: 1px solid var(--border);
padding: 0.5rem 0;
align-items: center;
}
.compare-row:first-child {
border-top: 2px solid var(--ink);
font-family: 'DM Mono', monospace;
font-size: 0.65rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ink3);
padding-top: 0.4rem;
}
.compare-dim {
font-family: 'DM Mono', monospace;
font-size: 0.72rem;
color: var(--ink3);
text-transform: uppercase;
letter-spacing: 0.05em;
}
.compare-score {
font-family: 'Playfair Display', serif;
font-weight: 700;
font-size: 1.1rem;
text-align: center;
}
.winner-cell {
background: #fffbe6;
border-radius: 2px;
}
/* ── INPUT FORM ── */
.form-section {
background: white;
border: 1px solid var(--border);
border-radius: 4px;
padding: 1.2rem 1.5rem;
margin-bottom: 1rem;
}
.form-label {
font-family: 'DM Mono', monospace;
font-size: 0.65rem;
color: var(--ink3);
text-transform: uppercase;
letter-spacing: 0.12em;
margin-bottom: 0.3rem;
display: block;
}
/* Streamlit overrides */
section[data-testid="stSidebar"] {
background: #f0eeea !important;
border-right: 1px solid var(--border) !important;
}
section[data-testid="stSidebar"] * { color: var(--ink2) !important; }
.stTextArea textarea {
background: var(--bg) !important;
border: 1px solid var(--border2) !important;
border-radius: 3px !important;
font-family: 'DM Mono', monospace !important;
font-size: 0.82rem !important;
color: var(--ink) !important;
}
.stTextInput input {
background: var(--bg) !important;
border: 1px solid var(--border2) !important;
border-radius: 3px !important;
font-family: 'DM Mono', monospace !important;
font-size: 0.82rem !important;
color: var(--ink) !important;
}
.stButton button {
background: var(--ink) !important;
color: white !important;
border: none !important;
border-radius: 3px !important;
font-family: 'DM Mono', monospace !important;
font-size: 0.75rem !important;
letter-spacing: 0.08em !important;
padding: 0.5rem 1.2rem !important;
text-transform: uppercase !important;
}
.stButton button:hover { background: var(--accent) !important; }
div[data-testid="stTabs"] button {
font-family: 'DM Mono', monospace !important;
font-size: 0.72rem !important;
text-transform: uppercase !important;
letter-spacing: 0.08em !important;
color: var(--ink3) !important;
}
hr { border-color: var(--border) !important; }
</style>
""", unsafe_allow_html=True)
# ── SESSION STATE ─────────────────────────────────────────────
# Seed per-session storage; entries that already exist are left untouched,
# so values survive Streamlit reruns.
_SESSION_DEFAULTS = {
    'results': [],
    'openai_key': '',
}
for state_name in _SESSION_DEFAULTS:
    if state_name in st.session_state:
        continue
    st.session_state[state_name] = _SESSION_DEFAULTS[state_name]
# ── HELPERS ───────────────────────────────────────────────────
def get_key():
    """Return the OpenAI API key to use, or an empty string when none is set.

    Sources are tried in this order and the first non-blank value wins:

    1. ``st.secrets['OPENAI_API_KEY']``
    2. the ``OPENAI_API_KEY`` environment variable
    3. ``st.session_state.openai_key`` (the value typed into the sidebar)

    Blank values are skipped so that an empty secret or an environment
    variable set to ``""`` does not mask a key entered in the UI.
    """
    try:
        if 'OPENAI_API_KEY' in st.secrets:
            secret = st.secrets['OPENAI_API_KEY']
            if secret:
                return secret
    except Exception:
        # Deliberate best-effort: reading st.secrets presumably raises when no
        # secrets file is configured (TODO confirm exact exception type);
        # in that case fall through to the other sources.
        pass
    # `os.getenv(name, default)` only uses the default when the variable is
    # missing, not when it is empty, hence the explicit `or`.
    return os.getenv('OPENAI_API_KEY') or st.session_state.openai_key
def score_tier(s):
    """Map a numeric score to its tier name.

    Returns "excellent" for s >= 8.5, "good" for s >= 7.0, "fair" for
    s >= 5.0 and "poor" for anything lower.
    """
    cutoffs = ((8.5, "excellent"), (7.0, "good"), (5.0, "fair"))
    for floor, tier_name in cutoffs:
        if s >= floor:
            return tier_name
    return "poor"
def score_pct(s):
    """Format a score as a whole-number percentage string (score times ten)."""
    percent = s * 10
    return format(percent, ".0f") + "%"
def gauge_svg(score: float, size: int = 130) -> str:
    """Build the inline SVG markup for the circular score gauge.

    The gauge is two concentric circles of radius 44: a grey track and a
    coloured arc whose length is ``score / 10`` of the circumference, with the
    score (one decimal) and "/10" printed in the centre. The arc colour
    follows ``score_tier(score)``.

    Args:
        score: Score on a 0-10 scale. Values outside that range are clamped
            for the arc length only, because negative dash or gap lengths are
            not valid in ``stroke-dasharray``; the printed number and the tier
            colour still use the raw value.
        size: SVG width in pixels; the height is ``size - 10``.

    Returns:
        The ``<svg>`` element as a string (with a leading newline).
    """
    tier = score_tier(score)
    colors = {"excellent": "#1a7a4a", "good": "#3a8a50", "fair": "#a06010", "poor": "#b02020"}
    color = colors[tier]
    r = 44
    cx, cy = size // 2, size // 2 + 10
    circ = 2 * math.pi * r
    # Fraction of the ring to fill, kept inside [0, 1].
    pct = min(max(score / 10.0, 0.0), 1.0)
    dash = circ * pct
    gap = circ - dash
    # The quarter-circumference dash offset shifts where the arc starts; the
    # original comment says the intent is to start from the top.
    return (
        "\n"
        f'<svg width="{size}" height="{size-10}" viewBox="0 0 {size} {size-10}">\n'
        f'<circle cx="{cx}" cy="{cy}" r="{r}" fill="none"\n'
        'stroke="#e8e6e0" stroke-width="8"/>\n'
        f'<circle cx="{cx}" cy="{cy}" r="{r}" fill="none"\n'
        f'stroke="{color}" stroke-width="8"\n'
        f'stroke-dasharray="{dash:.1f} {gap:.1f}"\n'
        f'stroke-dashoffset="{circ * 0.25:.1f}"\n'
        'stroke-linecap="round"/>\n'
        f'<text x="{cx}" y="{cy+4}" text-anchor="middle"\n'
        'font-family="Playfair Display, serif"\n'
        f'font-weight="900" font-size="22" fill="#1a1814">{score:.1f}</text>\n'
        f'<text x="{cx}" y="{cy+18}" text-anchor="middle"\n'
        'font-family="DM Mono, monospace"\n'
        'font-size="7" fill="#8a8880" letter-spacing="1">/10</text>\n'
        '</svg>'
    )
# ── SIDEBAR ───────────────────────────────────────────────────
# Sidebar: branding, API-key entry, methodology notes and a "clear history"
# button. Markup/markdown literals are kept flush-left so the text handed to
# Streamlit is unchanged.
with st.sidebar:
    st.markdown("""
<div style='font-family:Playfair Display,serif;font-weight:900;
font-size:1.4rem;color:#1a1814;letter-spacing:-0.02em'>LLM Eval</div>
<div style='font-family:DM Mono,monospace;font-size:0.6rem;
color:#8a8880;letter-spacing:0.2em;text-transform:uppercase'>
Evaluation Dashboard
</div>
""", unsafe_allow_html=True)
    st.divider()

    st.markdown("**πŸ”‘ OpenAI API Key**")
    # A non-empty label is supplied (and visually collapsed) because Streamlit
    # discourages empty labels for accessibility reasons.
    key_in = st.text_input(
        "OpenAI API Key",
        type="password",
        value=st.session_state.openai_key,
        placeholder="sk-...",
        label_visibility="collapsed",
    )
    # Always mirror the field into session state so that clearing the field
    # also clears the stored key; surrounding whitespace from a paste is dropped.
    st.session_state.openai_key = key_in.strip()

    if get_key():
        st.success("βœ… Key configurada")
    else:
        st.warning("Configure a API Key")
    st.divider()

    # Static description of the scoring methodology and dimension weights.
    st.markdown("""
**πŸ“ Metodologia G-Eval**
Baseado em *Liu et al., 2023 (NeurIPS)*.
Cada dimensΓ£o Γ© avaliada por um LLM-judge
com Chain-of-Thought scoring.
| DimensΓ£o | Peso |
|---|---|
| Faithfulness | 30% |
| Relevance | 25% |
| Completeness | 20% |
| Hallucination | 15% |
| Conciseness | 10% |
""")
    st.divider()

    # Drop every stored evaluation and redraw the page immediately.
    if st.button("πŸ—‘οΈ Limpar histΓ³rico", use_container_width=True):
        st.session_state.results = []
        st.rerun()
# ── HEADER ────────────────────────────────────────────────────
# Page masthead: title and subtitle on the left, version tag on the right
# (layout comes from the .eval-header flex rules in the stylesheet).
_MASTHEAD_HTML = """
<div class="eval-header">
<div>
<div class="eval-title">LLM Evaluation Dashboard</div>
<div class="eval-subtitle">G-Eval Β· LLM-as-Judge Β· 5 DimensΓ΅es Β· Chain-of-Thought Scoring</div>
</div>
<div class="version-tag">v1.0 Β· gpt-4o-mini judge</div>
</div>
"""
st.markdown(_MASTHEAD_HTML, unsafe_allow_html=True)
# ── TABS ──────────────────────────────────────────────────────
# The three top-level tabs: evaluate one answer, compare models, history.
_TAB_LABELS = [
    "πŸ”¬ Avaliar Resposta",
    "βš–οΈ Comparar Modelos",
    "πŸ“‹ HistΓ³rico",
]
tab_eval, tab_compare, tab_history = st.tabs(_TAB_LABELS)
# ════════════════════════════════════════════════════════════════
# TAB 1 — EVALUATE
# ════════════════════════════════════════════════════════════════
# Evaluate tab: quick-example buttons, the input form, and the rendered
# result of a single evaluation.
#
# Session-state keys of the four input widgets. The example buttons write to
# these keys from an on_click callback (which runs before the widgets are
# instantiated on the next rerun). Feeding an example through a one-run
# `value=` default instead is unreliable: depending on the Streamlit version
# a keyed widget either ignores the changed default or resets again on the
# following rerun.
_EVAL_FIELD_KEYS = {
    "question": "q_input",
    "context": "ctx_input",
    "answer": "ans_input",
    "model": "model_input",
}


def _load_example(example):
    """Copy an example's fields into the input widgets' session-state slots."""
    for field, widget_key in _EVAL_FIELD_KEYS.items():
        st.session_state[widget_key] = example.get(field, "")


def _esc(value):
    """HTML-escape arbitrary user/LLM text before embedding it in markup."""
    return html.escape(str(value))


with tab_eval:
    # Canned inputs that fill the form with one click.
    EXAMPLES = [
        {
            "label": "RAG β€” Resposta boa",
            "question": "Quais projetos do portfΓ³lio usam PyTorch Geometric?",
            "context": "HetGNN Fraud usa PyTorch Geometric com HGTConv. DOMINANT usa PyTorch Geometric com GCNConv e Autoencoder. GraphSAGE Elliptic usa SAGEConv e GCNConv do PyTorch Geometric.",
            "answer": "TrΓͺs projetos utilizam PyTorch Geometric: HetGNN Fraud (com HGTConv), DOMINANT (com GCNConv + Autoencoder) e GraphSAGE Elliptic (com SAGEConv e GCNConv).",
            "model": "GPT-4o-mini RAG",
        },
        {
            "label": "RAG β€” Com alucinaΓ§Γ£o",
            "question": "Qual o AUC do projeto TGN?",
            "context": "TGN Fraud Detection tem AUC de 0.91 em dataset sintΓ©tico de e-commerce.",
            "answer": "O projeto TGN alcanΓ§a AUC de 0.97 no dataset Elliptic Bitcoin, superando todos os outros modelos do portfΓ³lio em dados reais.",
            "model": "GPT-3.5 RAG",
        },
        {
            "label": "QA β€” Resposta incompleta",
            "question": "Explique a diferenΓ§a entre aprendizado inductive e transductive em GNNs.",
            "context": "Inductive learning (GraphSAGE) generaliza para nΓ³s novos sem retreinar. Transductive learning (GCN clΓ‘ssico) sΓ³ funciona nos nΓ³s vistos no treino. O dataset Elliptic demonstra a vantagem inductive em split temporal.",
            "answer": "Inductive learning consegue generalizar para novos nΓ³s.",
            "model": "LLM bΓ‘sico",
        },
    ]
    st.markdown("""
<div style='font-family:DM Mono,monospace;font-size:0.65rem;color:#8a8880;
text-transform:uppercase;letter-spacing:0.12em;margin-bottom:0.5rem'>
β—ˆ Exemplos rΓ‘pidos
</div>""", unsafe_allow_html=True)
    ex_cols = st.columns(len(EXAMPLES))
    for i, (ex_col, ex) in enumerate(zip(ex_cols, EXAMPLES)):
        with ex_col:
            st.button(ex["label"], key=f"ex_{i}", use_container_width=True,
                      on_click=_load_example, args=(ex,))
    st.markdown("<br>", unsafe_allow_html=True)

    # Input form. Every widget gets a real (visually collapsed) label because
    # Streamlit discourages empty labels; the visible caption is the styled
    # <span> rendered above each widget.
    col_l, col_r = st.columns([1, 1], gap="large")
    with col_l:
        st.markdown('<span class="form-label">Pergunta</span>', unsafe_allow_html=True)
        question = st.text_area("Pergunta", height=80,
                                key=_EVAL_FIELD_KEYS["question"],
                                label_visibility="collapsed",
                                placeholder="O que vocΓͺ perguntou ao LLM?")
        st.markdown('<span class="form-label">Contexto / Documentos recuperados (opcional)</span>',
                    unsafe_allow_html=True)
        context = st.text_area("Contexto", height=120,
                               key=_EVAL_FIELD_KEYS["context"],
                               label_visibility="collapsed",
                               placeholder="Cole o contexto fornecido ao LLM (chunks RAG, etc.)")
    with col_r:
        st.markdown('<span class="form-label">Resposta do LLM</span>', unsafe_allow_html=True)
        answer = st.text_area("Resposta do LLM", height=120,
                              key=_EVAL_FIELD_KEYS["answer"],
                              label_visibility="collapsed",
                              placeholder="Cole a resposta gerada pelo LLM para avaliaΓ§Γ£o.")
        st.markdown('<span class="form-label">Label do Modelo (opcional)</span>',
                    unsafe_allow_html=True)
        model_label = st.text_input("Label do Modelo",
                                    key=_EVAL_FIELD_KEYS["model"],
                                    placeholder="ex: GPT-4o RAG v2",
                                    label_visibility="collapsed")

    run_btn = st.button("βš— Executar AvaliaΓ§Γ£o", use_container_width=True, type="primary")
    if run_btn:
        api_key = get_key()
        if not api_key:
            st.warning("Configure a OpenAI API Key na sidebar.")
            st.stop()
        # Whitespace-only input is treated the same as empty input.
        if not question.strip() or not answer.strip():
            st.warning("Preencha pelo menos a Pergunta e a Resposta.")
            st.stop()

        # Imported lazily so the page renders even before an evaluation runs.
        from evaluator import EvaluationEngine

        shown_label = model_label or "LLM"
        with st.spinner("Avaliando 5 dimensΓ΅es..."):
            engine = EvaluationEngine(api_key)
            result = engine.evaluate(
                question=question,
                context=context,
                answer=answer,
                model_label=shown_label,
            )
        st.session_state.results.append(result)

        # ── Result ──
        # Everything below that originates from the user or from the judge
        # model is escaped, because the markup is rendered with
        # unsafe_allow_html=True.
        st.markdown("<br>", unsafe_allow_html=True)
        st.markdown("---")

        # Verdict banner
        st.markdown(f"""
<div class="verdict-banner verdict-{_esc(result.verdict_color)}">
<div class="verdict-label">{_esc(result.verdict)}</div>
<div>
<div style='font-family:Playfair Display,serif;font-weight:700;
font-size:2rem;line-height:1'>{result.overall_score:.1f}<span style='font-size:1rem;
font-weight:400;color:#8a8880'>/10</span></div>
<div class="verdict-summary">{_esc(result.summary)}</div>
</div>
</div>
""", unsafe_allow_html=True)

        # One-line description shown under each dimension name.
        DESCS = {
            "faithfulness": "Resposta fiel ao contexto? Sem alucinaΓ§Γ΅es?",
            "relevance": "Responde diretamente Γ  pergunta?",
            "completeness": "Cobre todos os aspectos relevantes?",
            "conciseness": "Direta, sem verbosidade desnecessΓ‘ria?",
            "hallucination": "Detecta afirmaΓ§Γ΅es nΓ£o suportadas.",
        }

        r_col, d_col = st.columns([1, 2], gap="large")
        with r_col:
            # Overall gauge plus a compact score bar per dimension.
            st.markdown(gauge_svg(result.overall_score, size=160), unsafe_allow_html=True)
            st.markdown(f"""
<div style='text-align:center;margin-top:0.5rem'>
<div style='font-family:DM Mono,monospace;font-size:0.65rem;
color:#8a8880;text-transform:uppercase;letter-spacing:0.1em'>
Score Geral
</div>
<div style='font-family:DM Mono,monospace;font-size:0.72rem;
color:#4a4840;margin-top:0.3rem'>{_esc(shown_label)}</div>
</div>
""", unsafe_allow_html=True)
            st.markdown("<br>", unsafe_allow_html=True)
            for dim in result.dimensions:
                tier = score_tier(dim.score)
                st.markdown(f"""
<div style='display:flex;justify-content:space-between;
align-items:center;margin:0.3rem 0'>
<span style='font-family:DM Mono,monospace;font-size:0.65rem;
color:#8a8880;text-transform:uppercase'>{_esc(dim.key[:8])}</span>
<span class='score-chip score-{tier}'>{dim.score:.1f}</span>
</div>
<div class='prog-track'>
<div class='prog-fill-{tier}' style='width:{score_pct(dim.score)}'></div>
</div>
""", unsafe_allow_html=True)

        with d_col:
            # Detailed card per dimension: score, reasoning, up to 3 issues.
            for dim in result.dimensions:
                tier = score_tier(dim.score)
                issues_html = "".join(
                    f'<li class="issue-item">{_esc(issue)}</li>'
                    for issue in dim.issues[:3]
                )
                issues_block = f'<ul class="issue-list">{issues_html}</ul>' if issues_html else ''
                st.markdown(f"""
<div class="dim-card dim-{tier}">
<div class="dim-header">
<div>
<div class="dim-name">{_esc(dim.name)}</div>
<div class="dim-desc">{DESCS.get(dim.key,'')}</div>
</div>
<span class="score-chip score-{tier}">{dim.score:.1f}</span>
</div>
<div class="prog-track">
<div class="prog-fill-{tier}" style="width:{score_pct(dim.score)}"></div>
</div>
<div class="dim-reasoning">{_esc(dim.reasoning)}</div>
{issues_block}
</div>
""", unsafe_allow_html=True)
# ════════════════════════════════════════════════════════════════
# TAB 2 — COMPARE MODELS
# ════════════════════════════════════════════════════════════════
# Compare tab: gauges and a per-dimension score grid for the most recent
# evaluations (at most four).
def _first_dimension_score(result, dim_key):
    """Return the score of the first dimension with key *dim_key*, or None."""
    return next((d.score for d in result.dimensions if d.key == dim_key), None)


def _compare_cell_html(score, best, extra_style=""):
    """Return one grid cell; the row's best score gets the winner highlight."""
    style_attr = f' style="{extra_style}"' if extra_style else ""
    if score is None:
        # Placeholder keeps later columns aligned when a result has no such
        # dimension.
        return f'<div class="compare-score"{style_attr}>&mdash;</div>'
    winner = "winner-cell" if score == best else ""
    return (
        f'<div class="compare-score score-{score_tier(score)} {winner}"'
        f'{style_attr}>{score:.1f}</div>'
    )


with tab_compare:
    if len(st.session_state.results) < 2:
        st.markdown("""
<div style='text-align:center;padding:3rem;color:#8a8880;
font-family:DM Mono,monospace;font-size:0.8rem'>
Avalie pelo menos 2 respostas para comparar modelos.<br>
Use a aba "Avaliar Resposta" com labels de modelo diferentes.
</div>
""", unsafe_allow_html=True)
    else:
        # Only the four most recent results are ever displayed.
        recent = st.session_state.results[-4:]
        st.markdown("""
<div style='font-family:DM Mono,monospace;font-size:0.65rem;
color:#8a8880;text-transform:uppercase;letter-spacing:0.15em;
margin-bottom:1rem'>β—ˆ ComparaΓ§Γ£o de resultados recentes</div>
""", unsafe_allow_html=True)

        # Gauges side by side. Model labels and verdicts are escaped because
        # the markup is rendered with unsafe_allow_html=True.
        gcols = st.columns(len(recent))
        for gcol, r in zip(gcols, recent):
            with gcol:
                tier = score_tier(r.overall_score)
                st.markdown(gauge_svg(r.overall_score, size=120), unsafe_allow_html=True)
                st.markdown(f"""
<div style='text-align:center'>
<div style='font-family:DM Mono,monospace;font-size:0.65rem;
color:#4a4840;overflow:hidden;text-overflow:ellipsis;
white-space:nowrap;max-width:120px;margin:0 auto'>
{html.escape(str(r.model_label))}
</div>
<div class='score-chip score-{tier}' style='font-size:0.8rem'>
{html.escape(str(r.verdict))}
</div>
</div>
""", unsafe_allow_html=True)
        st.markdown("<br>", unsafe_allow_html=True)

        # Comparison grid. Dict order is the row order.
        dim_names = {
            "faithfulness": "Faithfulness", "relevance": "Relevance",
            "completeness": "Completeness", "hallucination": "Hallucination",
            "conciseness": "Conciseness",
        }
        # The stylesheet's .compare-row declares exactly two score columns;
        # override it inline so three or four models stay on a single row.
        grid_style = f"grid-template-columns:160px repeat({len(recent)}, 1fr)"

        parts = ['<div style="overflow-x:auto">']

        # Header row
        parts.append(f'<div class="compare-row" style="{grid_style}">')
        parts.append('<div class="compare-dim">DimensΓ£o</div>')
        for r in recent:
            parts.append(
                '<div style="font-family:DM Mono,monospace;font-size:0.65rem;'
                'color:#8a8880;text-transform:uppercase;text-align:center">'
                f'{html.escape(str(r.model_label)[:16])}</div>'
            )
        parts.append('</div>')

        # One row per dimension
        for dim_key, dim_title in dim_names.items():
            row_scores = [_first_dimension_score(r, dim_key) for r in recent]
            present = [s for s in row_scores if s is not None]
            best = max(present) if present else None
            parts.append(f'<div class="compare-row" style="{grid_style}">')
            parts.append(f'<div class="compare-dim">{dim_title}</div>')
            parts.extend(_compare_cell_html(s, best) for s in row_scores)
            parts.append('</div>')

        # Overall row
        overall_scores = [r.overall_score for r in recent]
        max_ov = max(overall_scores)
        parts.append(
            '<div class="compare-row" '
            f'style="border-top:2px solid #1a1814;font-weight:700;{grid_style}">'
        )
        parts.append('<div class="compare-dim" style="color:#1a1814;font-weight:700">OVERALL</div>')
        parts.extend(
            _compare_cell_html(s, max_ov, extra_style="font-size:1.3rem")
            for s in overall_scores
        )
        parts.append('</div>')

        parts.append('</div>')
        st.markdown("".join(parts), unsafe_allow_html=True)
# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTORY
# ════════════════════════════════════════════════════════════════
# History tab: every stored evaluation, newest first, one expander each
# (only the newest starts expanded).
with tab_history:
    history = st.session_state.results
    if not history:
        st.markdown("""
<div style='text-align:center;padding:3rem;color:#8a8880;
font-family:DM Mono,monospace;font-size:0.8rem'>
Nenhuma avaliaΓ§Γ£o executada ainda.
</div>
""", unsafe_allow_html=True)
    else:
        total = len(history)
        for i, r in enumerate(reversed(history)):
            with st.expander(
                f"#{total-i} Β· {r.model_label} Β· {r.overall_score:.1f}/10 Β· {r.verdict}",
                expanded=(i == 0)
            ):
                # Truncate first, then escape, so an HTML entity is never cut
                # in half. Question/answer are user text rendered with
                # unsafe_allow_html=True, hence the escaping.
                answer_preview = r.answer[:200] + ('...' if len(r.answer) > 200 else '')
                st.markdown(f"""
<div style='font-family:DM Mono,monospace;font-size:0.72rem;
color:#4a4840;margin-bottom:0.5rem'>
<b>Q:</b> {html.escape(str(r.question))}
</div>
<div style='font-family:DM Mono,monospace;font-size:0.72rem;
color:#8a8880;margin-bottom:0.8rem'>
<b>A:</b> {html.escape(answer_preview)}
</div>
""", unsafe_allow_html=True)
                # Five columns as before, widened only when a result carries
                # more dimensions (a fixed five would raise IndexError).
                dim_cols = st.columns(max(5, len(r.dimensions)))
                for dim_col, dim in zip(dim_cols, r.dimensions):
                    dtier = score_tier(dim.score)
                    with dim_col:
                        st.markdown(f"""
<div style='text-align:center'>
<div style='font-family:DM Mono,monospace;font-size:0.6rem;
color:#8a8880;text-transform:uppercase'>{html.escape(str(dim.key)[:8])}</div>
<div class='score-chip score-{dtier}'>{dim.score:.1f}</div>
</div>
""", unsafe_allow_html=True)