""", unsafe_allow_html=True)
# Render one full-width button per entry in EXAMPLES, laid out side by side.
ex_cols = st.columns(len(EXAMPLES))
for idx, (column, example) in enumerate(zip(ex_cols, EXAMPLES)):
    with column:
        clicked = st.button(
            example["label"], key=f"ex_{idx}", use_container_width=True
        )
        if clicked:
            # Queue the chosen example in session state and rerun the script.
            st.session_state["ex_load"] = example
            st.rerun()
# Take the example queued by the example buttons (if any); pop() clears it so
# it is applied on this run only.
ex = st.session_state.pop("ex_load", None)
default = ex or {}
if ex:
    # Seed the input widgets through Session State *before* they are created.
    # Relying on `value=` alone is fragile: depending on the Streamlit version
    # a changed default either re-creates the widget (so the prefilled text is
    # lost on the next rerun) or is ignored for a keyed widget.
    for widget_key, field in (
        ("q_input", "question"),
        ("ctx_input", "context"),
        ("ans_input", "answer"),
        ("model_input", "model"),
    ):
        st.session_state[widget_key] = default.get(field, "")
st.markdown(" ", unsafe_allow_html=True)
col_l, col_r = st.columns([1, 1], gap="large")
with col_l:
    # Widget labels stay hidden ("collapsed") but are non-empty, which
    # Streamlit asks for on accessibility grounds.
    st.markdown('Pergunta', unsafe_allow_html=True)
    question = st.text_area(
        "Pergunta",
        height=80,
        key="q_input",
        label_visibility="collapsed",
        placeholder="O que você perguntou ao LLM?",
    )
    st.markdown('Contexto / Documentos recuperados (opcional)',
                unsafe_allow_html=True)
    context = st.text_area(
        "Contexto / Documentos recuperados (opcional)",
        height=120,
        key="ctx_input",
        label_visibility="collapsed",
        placeholder="Cole o contexto fornecido ao LLM (chunks RAG, etc.)",
    )
with col_r:
    st.markdown('Resposta do LLM', unsafe_allow_html=True)
    answer = st.text_area(
        "Resposta do LLM",
        height=120,
        key="ans_input",
        label_visibility="collapsed",
        placeholder="Cole a resposta gerada pelo LLM para avaliação.",
    )
    st.markdown('Label do Modelo (opcional)',
                unsafe_allow_html=True)
    model_label = st.text_input(
        "Label do Modelo (opcional)",
        key="model_input",
        placeholder="ex: GPT-4o RAG v2",
        label_visibility="collapsed",
    )
# NOTE(review): the first character of the button label is the remnant of a
# mis-decoded symbol that cannot be recovered from this file; restore the
# intended icon if it is known.
run_btn = st.button("β Executar Avaliação", use_container_width=True, type="primary")
if run_btn:
    # Read the key once and reuse it for both the check and the engine.
    api_key = get_key()
    if not api_key:
        st.warning("Configure a OpenAI API Key na sidebar.")
        st.stop()
    # Whitespace-only input counts as missing, so blank text is never sent
    # for evaluation.
    if not (question or "").strip() or not (answer or "").strip():
        st.warning("Preencha pelo menos a Pergunta e a Resposta.")
        st.stop()
    # Imported lazily, only when an evaluation is actually requested.
    from evaluator import EvaluationEngine
    with st.spinner("Avaliando 5 dimensões..."):
        engine = EvaluationEngine(api_key)
        result = engine.evaluate(
            question=question,
            context=context,
            answer=answer,
            # Fall back to a generic label when none was entered.
            model_label=model_label or "LLM",
        )
    # Keep the result in session state alongside earlier evaluations.
    st.session_state.results.append(result)
# ── RESULT ─────────────────────────────────────────
# Result section for the evaluation produced above: a spacer and a Markdown
# horizontal rule, followed by the verdict banner and a compact score list.
st.markdown(" ", unsafe_allow_html=True)
st.markdown("---")
# Verdict banner
# NOTE(review): the f-string below contains only a newline in this view; the
# banner markup appears to be missing — confirm against the original file.
st.markdown(f"""
""", unsafe_allow_html=True)
# Mini radar of weights
st.markdown(" ", unsafe_allow_html=True)
# One entry per dimension: the first 8 characters of its key followed by the
# score formatted with one decimal place.
for dim in result.dimensions:
# NOTE(review): `tier` is not used by the visible template — presumably it
# selected styling in markup that is not shown here; confirm.
tier = score_tier(dim.score)
st.markdown(f"""
{dim.key[:8]}{dim.score:.1f}
""", unsafe_allow_html=True)
# Detail column: one card per evaluated dimension with its name, a short
# description, the score, the reasoning and up to three issues.
# NOTE(review): `d_col` is not defined in the visible code — presumably it
# comes from an st.columns(...) call that is missing here; confirm.
with d_col:
# Short user-facing description of each dimension, looked up by dimension key.
DESCS = {
"faithfulness": "Resposta fiel ao contexto? Sem alucinaΓ§Γ΅es?",
"relevance": "Responde diretamente Γ pergunta?",
"completeness": "Cobre todos os aspectos relevantes?",
"conciseness": "Direta, sem verbosidade desnecessΓ‘ria?",
"hallucination": "Detecta afirmaΓ§Γ΅es nΓ£o suportadas.",
}
for dim in result.dimensions:
# NOTE(review): `tier` is not used by the visible template — presumably it
# selected styling in markup that is not shown here; confirm.
tier = score_tier(dim.score)
# Concatenate at most the first three issues into one fragment.
issues_html = ""
for issue in dim.issues[:3]:
issues_html += f'
{issue}
'
# Dimensions whose key is not in DESCS get an empty description; the issues
# fragment is emitted only when at least one issue was collected.
# NOTE(review): dim.reasoning and the issue texts are interpolated unescaped
# into markup rendered with unsafe_allow_html=True — consider escaping them if
# they can contain model-generated or user-supplied text.
st.markdown(f"""
{dim.name}
{DESCS.get(dim.key,'')}
{dim.score:.1f}
{dim.reasoning}
{f'
{issues_html}
' if issues_html else ''}
""", unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────
# TAB 2 — COMPARE MODELS (COMPARAR MODELOS)
# ────────────────────────────────────────────────────────────────
# Compare tab: shows a gauge per result and a table of per-dimension and
# overall scores for the most recent results (at most four).
with tab_compare:
# With fewer than two stored results only a hint message is shown.
if len(st.session_state.results) < 2:
st.markdown("""
Avalie pelo menos 2 respostas para comparar modelos.
Use a aba "Avaliar Resposta" com labels de modelo diferentes.
""", unsafe_allow_html=True)
# Gauges side by side
# NOTE(review): `results` is not defined in the visible code — presumably it
# is st.session_state.results, assigned in an else-branch that is missing
# here; confirm.
gcols = st.columns(min(len(results), 4))
for i, r in enumerate(results[-4:]):
with gcols[i]:
tier = score_tier(r.overall_score)
st.markdown(gauge_svg(r.overall_score, size=120), unsafe_allow_html=True)
st.markdown(f"""
'
# Rows: one table row per dimension key, one cell per compared result.
# NOTE(review): `dim_keys`, `dim_names` and the initial `table_html` are not
# defined in the visible code; confirm where they are set.
for dk in dim_keys:
# Collect this dimension's score from each compared result; a result that
# lacks the dimension contributes no entry.
row_scores = []
for r in results[-4:]:
for d in r.dimensions:
if d.key == dk:
row_scores.append(d.score)
break
# Highest score in the row; 0 when no result has this dimension.
max_s = max(row_scores) if row_scores else 0
table_html += '
'
table_html += f'
{dim_names[dk]}
'
for s in row_scores:
tier = score_tier(s)
# True for every cell that ties the row maximum.
# NOTE(review): `is_winner` is not used by the visible cell template —
# presumably it selected styling in markup that is not shown here; confirm.
is_winner = s == max_s
table_html += f'
{s:.1f}
'
table_html += '
'
# Overall: final row with each compared result's overall score.
overall_scores = [r.overall_score for r in results[-4:]]
max_ov = max(overall_scores)
table_html += '
'
table_html += '
OVERALL
'
for s in overall_scores:
tier = score_tier(s)
is_winner = s == max_ov
table_html += f'
{s:.1f}
'
table_html += '
'
table_html += '
'
st.markdown(table_html, unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────
# TAB 3 — HISTORY (HISTÓRICO)
# ────────────────────────────────────────────────────────────────
with tab_history:
if not st.session_state.results:
st.markdown("""
Nenhuma avaliaΓ§Γ£o executada ainda.
""", unsafe_allow_html=True)
else:
for i, r in enumerate(reversed(st.session_state.results)):
tier = score_tier(r.overall_score)
with st.expander(
f"#{len(st.session_state.results)-i} Β· {r.model_label} Β· {r.overall_score:.1f}/10 Β· {r.verdict}",
expanded=(i == 0)
):
st.markdown(f"""
Q: {r.question}
A: {r.answer[:200]}{'...' if len(r.answer)>200 else ''}
""", unsafe_allow_html=True)
dim_cols = st.columns(5)
for j, dim in enumerate(r.dimensions):
dtier = score_tier(dim.score)
with dim_cols[j]:
st.markdown(f"""