# app.py — LLM Evaluation Dashboard | G-Eval Style | Daniel Fonseca
"""Streamlit dashboard that scores LLM answers with a G-Eval style LLM judge.

The page has three tabs: evaluate a single answer, compare the most recent
evaluations side by side, and browse the evaluation history kept in
``st.session_state.results``.

NOTE(review): the HTML/SVG/CSS markup in this file was rebuilt; the original
markup was not recoverable. Restyle freely, but keep every dynamic value
wrapped in ``esc()`` because the markup is rendered with
``unsafe_allow_html=True``.
"""
import html
import math
import os

import streamlit as st

st.set_page_config(
    page_title="LLM Eval · Daniel Fonseca",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── CSS: scientific-laboratory theme ──────────────────────────
# Kept at column 0 with no blank lines so Markdown treats it as raw HTML.
PAGE_CSS = """
<style>
.lab-brand { font-size: 1.3rem; font-weight: 700; letter-spacing: .04em; }
.lab-title { font-size: 1.8rem; font-weight: 700; }
.lab-muted { color: #6b7785; font-size: .85rem; }
.lab-badge { display: inline-block; margin-top: .4rem; padding: .1rem .5rem; border: 1px solid #c9d1d9; border-radius: 4px; font-family: monospace; font-size: .75rem; color: #6b7785; }
.lab-label { font-size: .72rem; font-weight: 600; letter-spacing: .08em; text-transform: uppercase; color: #6b7785; margin: .6rem 0 .2rem; }
.lab-card { border: 1px solid #dde3e9; border-radius: 6px; padding: .8rem 1rem; margin-bottom: .7rem; }
.lab-bar { height: 6px; border-radius: 3px; background: #e4e8ec; }
.lab-table { width: 100%; border-collapse: collapse; }
.lab-table th, .lab-table td { padding: .4rem .6rem; border-bottom: 1px solid #dde3e9; text-align: center; }
.lab-table th:first-child, .lab-table td:first-child { text-align: left; }
</style>
"""
st.markdown(PAGE_CSS, unsafe_allow_html=True)

# ── SESSION STATE ─────────────────────────────────────────────
for k, v in {
    'results': [],
    'openai_key': '',
}.items():
    if k not in st.session_state:
        st.session_state[k] = v

# ── CONSTANTS ─────────────────────────────────────────────────
SPACER = "<br>"

# Stroke/text colour for each score tier returned by score_tier().
TIER_COLORS = {
    "excellent": "#1a7a4a",
    "good": "#3a8a50",
    "fair": "#a06010",
    "poor": "#b02020",
}

# Display names for the judged dimensions, in table-row order.
DIM_NAMES = {
    "faithfulness": "Faithfulness",
    "relevance": "Relevance",
    "completeness": "Completeness",
    "hallucination": "Hallucination",
    "conciseness": "Conciseness",
}

# One-line description shown on each dimension card.
DESCS = {
    "faithfulness": "Resposta fiel ao contexto? Sem alucinações?",
    "relevance": "Responde diretamente à pergunta?",
    "completeness": "Cobre todos os aspectos relevantes?",
    "conciseness": "Direta, sem verbosidade desnecessária?",
    "hallucination": "Detecta afirmações não suportadas.",
}

# Plain Markdown (no HTML); the table needs the preceding blank line.
METHODOLOGY_MD = """
**📐 Metodologia G-Eval**

Baseado em *Liu et al., 2023 (EMNLP)*.
Cada dimensão é avaliada por um LLM-judge com Chain-of-Thought scoring.

| Dimensão | Peso |
|---|---|
| Faithfulness | 30% |
| Relevance | 25% |
| Completeness | 20% |
| Hallucination | 15% |
| Conciseness | 10% |
"""

# Quick examples that pre-fill the evaluation form.
EXAMPLES = [
    {
        "label": "RAG — Resposta boa",
        "question": "Quais projetos do portfólio usam PyTorch Geometric?",
        "context": "HetGNN Fraud usa PyTorch Geometric com HGTConv. DOMINANT usa PyTorch Geometric com GCNConv e Autoencoder. GraphSAGE Elliptic usa SAGEConv e GCNConv do PyTorch Geometric.",
        "answer": "Três projetos utilizam PyTorch Geometric: HetGNN Fraud (com HGTConv), DOMINANT (com GCNConv + Autoencoder) e GraphSAGE Elliptic (com SAGEConv e GCNConv).",
        "model": "GPT-4o-mini RAG",
    },
    {
        "label": "RAG — Com alucinação",
        "question": "Qual o AUC do projeto TGN?",
        "context": "TGN Fraud Detection tem AUC de 0.91 em dataset sintético de e-commerce.",
        "answer": "O projeto TGN alcança AUC de 0.97 no dataset Elliptic Bitcoin, superando todos os outros modelos do portfólio em dados reais.",
        "model": "GPT-3.5 RAG",
    },
    {
        "label": "QA — Resposta incompleta",
        "question": "Explique a diferença entre aprendizado inductive e transductive em GNNs.",
        "context": "Inductive learning (GraphSAGE) generaliza para nós novos sem retreinar. Transductive learning (GCN clássico) só funciona nos nós vistos no treino. O dataset Elliptic demonstra a vantagem inductive em split temporal.",
        "answer": "Inductive learning consegue generalizar para novos nós.",
        "model": "LLM básico",
    },
]


# ── HELPERS ───────────────────────────────────────────────────
def esc(value) -> str:
    """Return *value* as text that is safe to embed in HTML content or attributes."""
    return html.escape(str(value), quote=True)


def get_key() -> str:
    """Return the OpenAI API key.

    Lookup order: Streamlit secrets, then the ``OPENAI_API_KEY`` environment
    variable, then the key typed into the sidebar. Returns an empty string
    when none of them is set.
    """
    try:
        if 'OPENAI_API_KEY' in st.secrets:
            return st.secrets['OPENAI_API_KEY']
    except Exception:
        # Deliberate best effort: accessing st.secrets can raise when no
        # secrets file is configured, which is the normal case locally.
        pass
    # `or` (not a getenv default) so an empty env var still falls back.
    return os.getenv('OPENAI_API_KEY') or st.session_state.openai_key


def score_tier(s: float) -> str:
    """Map a 0-10 score to ``"excellent"``, ``"good"``, ``"fair"`` or ``"poor"``."""
    if s >= 8.5:
        return "excellent"
    if s >= 7.0:
        return "good"
    if s >= 5.0:
        return "fair"
    return "poor"


def score_pct(s: float) -> str:
    """Format a 0-10 score as a whole percentage string, e.g. ``7.5 -> "75%"``."""
    return f"{s * 10:.0f}%"


def gauge_svg(score: float, size: int = 130) -> str:
    """Return an SVG ring gauge for a 0-10 score.

    Args:
        score: Score to display; values outside 0-10 are clamped so the arc
            length can never be negative or exceed the full ring.
        size: Width of the SVG in pixels (the height is ``size + 20``).
    """
    value = max(0.0, min(10.0, float(score)))
    color = TIER_COLORS[score_tier(value)]
    r = 44
    cx, cy = size // 2, size // 2 + 10
    circ = 2 * math.pi * r
    dash = circ * value / 10.0
    gap = circ - dash
    # rotate(-90) makes the arc start at the top of the ring.
    return (
        f'<svg width="{size}" height="{size + 20}" viewBox="0 0 {size} {size + 20}" '
        'xmlns="http://www.w3.org/2000/svg" style="display:block;margin:0 auto">'
        f'<circle cx="{cx}" cy="{cy}" r="{r}" fill="none" stroke="#e4e8ec" stroke-width="8"/>'
        f'<circle cx="{cx}" cy="{cy}" r="{r}" fill="none" stroke="{color}" stroke-width="8" '
        f'stroke-linecap="round" stroke-dasharray="{dash:.1f} {gap:.1f}" '
        f'transform="rotate(-90 {cx} {cy})"/>'
        f'<text x="{cx}" y="{cy + 4}" text-anchor="middle" font-size="22" '
        f'font-weight="700" fill="{color}">{value:.1f}</text>'
        f'<text x="{cx}" y="{cy + 22}" text-anchor="middle" font-size="10" '
        'fill="#7a8694">/10</text>'
        '</svg>'
    )


def field_label(text: str) -> None:
    """Render the small caption shown above a form field."""
    st.markdown(f'<div class="lab-label">{esc(text)}</div>', unsafe_allow_html=True)


def dimension_score(result, key: str):
    """Return the score of the first dimension named *key*, or ``None`` if absent."""
    return next((d.score for d in result.dimensions if d.key == key), None)


def _score_row(label: str, scores) -> str:
    """Build one comparison-table row; the best score(s) in the row are bold."""
    present = [s for s in scores if s is not None]
    best = max(present) if present else None
    cells = []
    for s in scores:
        if s is None:
            # Keep the column aligned when a result lacks this dimension.
            cells.append('<td class="lab-muted">–</td>')
            continue
        color = TIER_COLORS[score_tier(s)]
        weight = 700 if s == best else 400
        cells.append(f'<td style="color:{color};font-weight:{weight}">{s:.1f}</td>')
    return f"<tr><td>{esc(label)}</td>{''.join(cells)}</tr>"


def comparison_table_html(results) -> str:
    """Return the HTML table comparing per-dimension and overall scores."""
    head = "".join(f"<th>{esc(r.model_label[:16])}</th>" for r in results)
    rows = [
        _score_row(name, [dimension_score(r, key) for r in results])
        for key, name in DIM_NAMES.items()
    ]
    rows.append(_score_row("OVERALL", [r.overall_score for r in results]))
    return (
        '<table class="lab-table">'
        f"<thead><tr><th>Dimensão</th>{head}</tr></thead>"
        f"<tbody>{''.join(rows)}</tbody>"
        "</table>"
    )


def render_result(result, model_label: str) -> None:
    """Render the verdict banner, gauge and per-dimension cards for *result*.

    Every text field of *result* comes from the LLM judge or from user input,
    so it is escaped before being placed into the HTML.
    """
    st.markdown(SPACER, unsafe_allow_html=True)
    st.markdown("---")

    # Verdict banner
    overall_color = TIER_COLORS[score_tier(result.overall_score)]
    st.markdown(
        f'<div class="lab-card" style="border-left:4px solid {overall_color}">'
        f'<div style="font-weight:700;color:{overall_color}">{esc(result.verdict)}</div>'
        f'<div style="font-family:monospace">{result.overall_score:.1f}/10</div>'
        f'<div class="lab-muted">{esc(result.summary)}</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    r_col, d_col = st.columns([1, 2], gap="large")

    with r_col:
        st.markdown(gauge_svg(result.overall_score, size=160), unsafe_allow_html=True)
        st.markdown(
            '<div style="text-align:center">'
            '<div class="lab-label">Score Geral</div>'
            f'<div class="lab-muted">{esc(model_label)}</div>'
            '</div>',
            unsafe_allow_html=True,
        )
        st.markdown(SPACER, unsafe_allow_html=True)
        # One compact bar per dimension.
        for dim in result.dimensions:
            color = TIER_COLORS[score_tier(dim.score)]
            st.markdown(
                '<div style="margin-bottom:.45rem">'
                '<div style="display:flex;justify-content:space-between;'
                'font-family:monospace;font-size:.75rem">'
                f'<span>{esc(dim.key[:8])}</span>'
                f'<span style="color:{color}">{dim.score:.1f}</span>'
                '</div>'
                '<div class="lab-bar">'
                f'<div class="lab-bar" style="width:{score_pct(dim.score)};'
                f'max-width:100%;background:{color}"></div>'
                '</div>'
                '</div>',
                unsafe_allow_html=True,
            )

    with d_col:
        for dim in result.dimensions:
            color = TIER_COLORS[score_tier(dim.score)]
            # Show at most three issues; tolerate a missing/None issue list.
            issues_html = "".join(
                f"<li>{esc(issue)}</li>" for issue in (dim.issues or [])[:3]
            )
            issues_block = f'<ul class="lab-muted">{issues_html}</ul>' if issues_html else ''
            st.markdown(
                '<div class="lab-card">'
                '<div style="display:flex;justify-content:space-between">'
                f'<span style="font-weight:600">{esc(dim.name)}</span>'
                f'<span style="font-family:monospace;color:{color}">{dim.score:.1f}</span>'
                '</div>'
                f'<div class="lab-muted">{esc(DESCS.get(dim.key, ""))}</div>'
                f'<div style="margin-top:.4rem">{esc(dim.reasoning)}</div>'
                f'{issues_block}'
                '</div>',
                unsafe_allow_html=True,
            )


# ── SIDEBAR ───────────────────────────────────────────────────
with st.sidebar:
    st.markdown(
        '<div class="lab-brand">LLM Eval</div>'
        '<div class="lab-muted">Evaluation Dashboard</div>',
        unsafe_allow_html=True,
    )
    st.divider()
    st.markdown("**🔑 OpenAI API Key**")
    # Non-empty label for accessibility; it stays hidden via label_visibility.
    key_in = st.text_input(
        "OpenAI API Key",
        type="password",
        value=st.session_state.openai_key,
        placeholder="sk-...",
        label_visibility="collapsed",
    )
    if key_in:
        st.session_state.openai_key = key_in
    if get_key():
        st.success("✅ Key configurada")
    else:
        st.warning("Configure a API Key")
    st.divider()
    st.markdown(METHODOLOGY_MD)
    st.divider()
    if st.button("🗑️ Limpar histórico", use_container_width=True):
        st.session_state.results = []
        st.rerun()

# ── HEADER ────────────────────────────────────────────────────
st.markdown(
    '<div style="margin-bottom:1rem">'
    '<div class="lab-title">LLM Evaluation Dashboard</div>'
    '<div class="lab-muted">G-Eval · LLM-as-Judge · 5 Dimensões · '
    'Chain-of-Thought Scoring</div>'
    '<div class="lab-badge">v1.0 · gpt-4o-mini judge</div>'
    '</div>',
    unsafe_allow_html=True,
)

# ── TABS ──────────────────────────────────────────────────────
tab_eval, tab_compare, tab_history = st.tabs([
    "🔬 Avaliar Resposta",
    "⚖️ Comparar Modelos",
    "📋 Histórico",
])

# ════════════════════════════════════════════════════════════════
# TAB 1 — EVALUATE
# ════════════════════════════════════════════════════════════════
with tab_eval:
    field_label("◈ Exemplos rápidos")
    ex_cols = st.columns(len(EXAMPLES))
    for i, ex in enumerate(EXAMPLES):
        with ex_cols[i]:
            if st.button(ex["label"], key=f"ex_{i}", use_container_width=True):
                # Stash the example and rerun so the form picks it up below.
                st.session_state["ex_load"] = ex
                st.rerun()

    ex = st.session_state.pop("ex_load", None)
    default = ex or {}

    st.markdown(SPACER, unsafe_allow_html=True)

    col_l, col_r = st.columns([1, 1], gap="large")

    with col_l:
        field_label("Pergunta")
        question = st.text_area(
            "Pergunta",
            value=default.get("question", ""),
            height=80,
            key="q_input",
            label_visibility="collapsed",
            placeholder="O que você perguntou ao LLM?",
        )
        field_label("Contexto / Documentos recuperados (opcional)")
        context = st.text_area(
            "Contexto / Documentos recuperados (opcional)",
            value=default.get("context", ""),
            height=120,
            key="ctx_input",
            label_visibility="collapsed",
            placeholder="Cole o contexto fornecido ao LLM (chunks RAG, etc.)",
        )

    with col_r:
        field_label("Resposta do LLM")
        answer = st.text_area(
            "Resposta do LLM",
            value=default.get("answer", ""),
            height=120,
            key="ans_input",
            label_visibility="collapsed",
            placeholder="Cole a resposta gerada pelo LLM para avaliação.",
        )
        field_label("Label do Modelo (opcional)")
        model_label = st.text_input(
            "Label do Modelo (opcional)",
            value=default.get("model", ""),
            placeholder="ex: GPT-4o RAG v2",
            label_visibility="collapsed",
        )

    run_btn = st.button("⚗ Executar Avaliação", use_container_width=True, type="primary")

    # Branch instead of calling st.stop(): stopping the script here would
    # also prevent the Compare and History tabs from rendering on this run.
    if run_btn:
        if not get_key():
            st.warning("Configure a OpenAI API Key na sidebar.")
        elif not question.strip() or not answer.strip():
            st.warning("Preencha pelo menos a Pergunta e a Resposta.")
        else:
            # Imported lazily so the page loads even before the engine is needed.
            from evaluator import EvaluationEngine

            try:
                with st.spinner("Avaliando 5 dimensões..."):
                    engine = EvaluationEngine(get_key())
                    result = engine.evaluate(
                        question=question,
                        context=context,
                        answer=answer,
                        model_label=model_label or "LLM",
                    )
            except Exception as exc:
                # UI boundary: report the failure instead of crashing the page.
                st.error(f"Falha na avaliação: {exc}")
            else:
                st.session_state.results.append(result)
                render_result(result, model_label or "LLM")

# ════════════════════════════════════════════════════════════════
# TAB 2 — COMPARE MODELS
# ════════════════════════════════════════════════════════════════
with tab_compare:
    if len(st.session_state.results) < 2:
        st.markdown(
            '<div class="lab-card lab-muted" style="text-align:center">'
            'Avalie pelo menos 2 respostas para comparar modelos.<br>'
            'Use a aba "Avaliar Resposta" com labels de modelo diferentes.'
            '</div>',
            unsafe_allow_html=True,
        )
    else:
        # Only the four most recent evaluations fit side by side.
        recent = st.session_state.results[-4:]
        field_label("◈ Comparação de resultados recentes")

        # Gauges side by side
        gauge_cols = st.columns(len(recent))
        for col, r in zip(gauge_cols, recent):
            with col:
                st.markdown(gauge_svg(r.overall_score, size=120), unsafe_allow_html=True)
                st.markdown(
                    '<div style="text-align:center">'
                    f'<div style="font-weight:600">{esc(r.model_label)}</div>'
                    f'<div class="lab-muted">{esc(r.verdict)}</div>'
                    '</div>',
                    unsafe_allow_html=True,
                )

        st.markdown(SPACER, unsafe_allow_html=True)

        # Comparison table
        st.markdown(comparison_table_html(recent), unsafe_allow_html=True)

# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTORY
# ════════════════════════════════════════════════════════════════
with tab_history:
    if not st.session_state.results:
        st.markdown(
            '<div class="lab-card lab-muted" style="text-align:center">'
            'Nenhuma avaliação executada ainda.'
            '</div>',
            unsafe_allow_html=True,
        )
    else:
        total = len(st.session_state.results)
        # Newest first; only the newest entry starts expanded.
        for i, r in enumerate(reversed(st.session_state.results)):
            color = TIER_COLORS[score_tier(r.overall_score)]
            with st.expander(
                f"#{total - i} · {r.model_label} · {r.overall_score:.1f}/10 · {r.verdict}",
                expanded=(i == 0),
            ):
                # Truncate before escaping so an HTML entity is never cut in half.
                preview = r.answer[:200] + ('...' if len(r.answer) > 200 else '')
                st.markdown(
                    f'<div class="lab-card" style="border-left:4px solid {color}">'
                    f'<div><strong>Q:</strong> {esc(r.question)}</div>'
                    f'<div><strong>A:</strong> {esc(preview)}</div>'
                    '</div>',
                    unsafe_allow_html=True,
                )
                # At least five columns keeps the usual layout; more are added
                # when a result carries extra dimensions.
                dim_cols = st.columns(max(5, len(r.dimensions)))
                for j, dim in enumerate(r.dimensions):
                    dcolor = TIER_COLORS[score_tier(dim.score)]
                    with dim_cols[j]:
                        st.markdown(
                            '<div style="text-align:center">'
                            f'<div class="lab-label">{esc(dim.key[:8])}</div>'
                            f'<div style="font-family:monospace;font-weight:700;'
                            f'color:{dcolor}">{dim.score:.1f}</div>'
                            '</div>',
                            unsafe_allow_html=True,
                        )