# app.py — LLM Evaluation Dashboard | G-Eval Style | Daniel Fonseca
"""Streamlit dashboard that scores LLM answers with an LLM-as-judge.

The page has three tabs:

* "Avaliar Resposta" sends a question / context / answer triple to
  ``evaluator.EvaluationEngine`` and renders the overall score plus one card
  per dimension.
* "Comparar Modelos" shows the four most recent results side by side.
* "Histórico" lists every result stored in the session, newest first.

NOTE(review): in the reviewed copy of this file the HTML/CSS/SVG that lived
inside the string literals was not present (only the text content remained).
The markup emitted below is therefore deliberately minimal and class-free;
re-apply the project stylesheet and class names if they exist elsewhere.
"""
import html
import math
import os

import streamlit as st

# Must remain the first Streamlit call of the script.
st.set_page_config(
    page_title="LLM Eval · Daniel Fonseca",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── CSS: SCIENTIFIC LAB THEME ─────────────────────────────────
# NOTE(review): the stylesheet body is blank in the reviewed source, so this
# call currently injects nothing. Kept as the hook for the theme.
st.markdown(""" """, unsafe_allow_html=True)

# ── CONSTANTS ─────────────────────────────────────────────────
# Colour per quality tier; shared by the gauge and every score read-out.
TIER_COLORS = {
    "excellent": "#1a7a4a",
    "good": "#3a8a50",
    "fair": "#a06010",
    "poor": "#b02020",
}

# Display name per dimension key, in the row order of the comparison table.
DIM_NAMES = {
    "faithfulness": "Faithfulness",
    "relevance": "Relevance",
    "completeness": "Completeness",
    "hallucination": "Hallucination",
    "conciseness": "Conciseness",
}

# One-line description shown on each dimension card.
DESCS = {
    "faithfulness": "Resposta fiel ao contexto? Sem alucinações?",
    "relevance": "Responde diretamente à pergunta?",
    "completeness": "Cobre todos os aspectos relevantes?",
    "conciseness": "Direta, sem verbosidade desnecessária?",
    "hallucination": "Detecta afirmações não suportadas.",
}

# Quick examples that pre-fill the evaluation form.
EXAMPLES = [
    {
        "label": "RAG — Resposta boa",
        "question": "Quais projetos do portfólio usam PyTorch Geometric?",
        "context": (
            "HetGNN Fraud usa PyTorch Geometric com HGTConv. "
            "DOMINANT usa PyTorch Geometric com GCNConv e Autoencoder. "
            "GraphSAGE Elliptic usa SAGEConv e GCNConv do PyTorch Geometric."
        ),
        "answer": (
            "Três projetos utilizam PyTorch Geometric: HetGNN Fraud (com HGTConv), "
            "DOMINANT (com GCNConv + Autoencoder) e GraphSAGE Elliptic "
            "(com SAGEConv e GCNConv)."
        ),
        "model": "GPT-4o-mini RAG",
    },
    {
        "label": "RAG — Com alucinação",
        "question": "Qual o AUC do projeto TGN?",
        "context": (
            "TGN Fraud Detection tem AUC de 0.91 em dataset sintético de e-commerce."
        ),
        "answer": (
            "O projeto TGN alcança AUC de 0.97 no dataset Elliptic Bitcoin, "
            "superando todos os outros modelos do portfólio em dados reais."
        ),
        "model": "GPT-3.5 RAG",
    },
    {
        "label": "QA — Resposta incompleta",
        "question": (
            "Explique a diferença entre aprendizado inductive e transductive em GNNs."
        ),
        "context": (
            "Inductive learning (GraphSAGE) generaliza para nós novos sem retreinar. "
            "Transductive learning (GCN clássico) só funciona nos nós vistos no treino. "
            "O dataset Elliptic demonstra a vantagem inductive em split temporal."
        ),
        "answer": "Inductive learning consegue generalizar para novos nós.",
        "model": "LLM básico",
    },
]

# Sidebar methodology note. A Markdown table needs one row per line, so the
# text is assembled line by line instead of as a single run-on string.
# NOTE(review): the G-Eval paper (Liu et al., 2023) is usually cited as
# EMNLP 2023 — confirm the venue shown to users.
METHODOLOGY_MD = "\n".join(
    [
        "**📐 Metodologia G-Eval**",
        "",
        "Baseado em *Liu et al., 2023 (NeurIPS)*. Cada dimensão é avaliada "
        "por um LLM-judge com Chain-of-Thought scoring.",
        "",
        "| Dimensão | Peso |",
        "|---|---|",
        "| Faithfulness | 30% |",
        "| Relevance | 25% |",
        "| Completeness | 20% |",
        "| Hallucination | 15% |",
        "| Conciseness | 10% |",
    ]
)

# Vertical spacer between sections.
# NOTE(review): the original literal was empty apart from a line break;
# presumably it was a <br> — confirm.
SPACER = "<br>"

# ── SESSION STATE ─────────────────────────────────────────────
for state_key, initial in {
    'results': [],
    'openai_key': '',
}.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = initial


# ── HELPERS ───────────────────────────────────────────────────
def _esc(value) -> str:
    """Return ``value`` as text that is safe to embed in HTML markup."""
    return html.escape(str(value))


def get_key():
    """Return the OpenAI API key to use, or an empty string if none is set.

    Lookup order: Streamlit secrets, then the ``OPENAI_API_KEY`` environment
    variable, then the key typed into the sidebar.
    """
    try:
        if 'OPENAI_API_KEY' in st.secrets:
            return st.secrets['OPENAI_API_KEY']
    except Exception:
        # Best effort on purpose: when secrets cannot be read (for example no
        # secrets file is configured) fall through to the other sources.
        pass
    # `or` rather than a getenv default, so that an environment variable that
    # is set but empty does not hide the key entered in the sidebar.
    return os.getenv('OPENAI_API_KEY') or st.session_state.openai_key


def score_tier(s):
    """Map a 0–10 score to 'excellent', 'good', 'fair' or 'poor'."""
    if s >= 8.5:
        return "excellent"
    if s >= 7.0:
        return "good"
    if s >= 5.0:
        return "fair"
    return "poor"


def score_pct(s):
    """Format a 0–10 score as a whole-number percentage, e.g. 7.5 -> '75%'."""
    return f"{s * 10:.0f}%"


def gauge_svg(score: float, size: int = 130) -> str:
    """Return an inline SVG gauge for a 0–10 score.

    The gauge is a grey track circle with a coloured arc on top whose length
    is ``score / 10`` of the circumference, starting at the top of the circle.
    The colour follows ``score_tier`` and the score is printed in the centre.

    Args:
        score: Score on the 0–10 scale; the arc is clamped to that range.
        size: Width and height of the SVG canvas in pixels.

    Returns:
        The SVG markup as a single line (blank lines inside an HTML block
        would end it early when rendered through Markdown).
    """
    color = TIER_COLORS[score_tier(score)]
    radius = 44
    cx, cy = size // 2, size // 2 + 10
    circumference = 2 * math.pi * radius
    # Clamp so an out-of-range score cannot yield a negative dash or gap.
    filled = min(max(score / 10.0, 0.0), 1.0)
    dash = circumference * filled
    gap = circumference - dash
    return (
        f'<svg width="{size}" height="{size}" viewBox="0 0 {size} {size}" '
        f'xmlns="http://www.w3.org/2000/svg">'
        f'<circle cx="{cx}" cy="{cy}" r="{radius}" fill="none" '
        f'stroke="#e4e4e4" stroke-width="8"/>'
        # Rotated -90° so the arc starts at the top instead of at 3 o'clock.
        f'<circle cx="{cx}" cy="{cy}" r="{radius}" fill="none" '
        f'stroke="{color}" stroke-width="8" '
        f'stroke-dasharray="{dash:.2f} {gap:.2f}" '
        f'transform="rotate(-90 {cx} {cy})"/>'
        f'<text x="{cx}" y="{cy}" text-anchor="middle" '
        f'dominant-baseline="central" font-size="22" font-weight="700" '
        f'fill="{color}">{score:.1f}</text>'
        f'</svg>'
    )


def _dimension_score(result, key):
    """Return the score of dimension ``key`` in ``result``, or None if absent."""
    return next((d.score for d in result.dimensions if d.key == key), None)


def _score_cell(score, best) -> str:
    """Return one table cell for ``score``; the row's best score is bold."""
    if score is None:
        return "<td>—</td>"
    text = f"{score:.1f}"
    if score == best:
        text = f"<strong>{text}</strong>"
    color = TIER_COLORS[score_tier(score)]
    return f'<td style="color:{color}">{text}</td>'


def _comparison_table_html(results) -> str:
    """Build the comparison table: one column per result, one row per dimension.

    A result that lacks a dimension gets a "—" cell, so every score stays in
    the column of the model it belongs to. A final OVERALL row is appended.
    """
    parts = ["<table><thead><tr><th>Dimensão</th>"]
    parts += [f"<th>{_esc(r.model_label[:16])}</th>" for r in results]
    parts.append("</tr></thead><tbody>")

    for key, name in DIM_NAMES.items():
        scores = [_dimension_score(r, key) for r in results]
        best = max((s for s in scores if s is not None), default=None)
        parts.append(f"<tr><td>{name}</td>")
        parts += [_score_cell(s, best) for s in scores]
        parts.append("</tr>")

    overall = [r.overall_score for r in results]
    best_overall = max(overall, default=None)
    parts.append("<tr><td>OVERALL</td>")
    parts += [_score_cell(s, best_overall) for s in overall]
    parts.append("</tr></tbody></table>")
    return "".join(parts)


# ── SIDEBAR ───────────────────────────────────────────────────
with st.sidebar:
    st.markdown("LLM Eval\n\nEvaluation Dashboard", unsafe_allow_html=True)
    st.divider()

    st.markdown("**🔑 OpenAI API Key**")
    # A real (hidden) label instead of "" keeps the widget accessible.
    key_in = st.text_input(
        "OpenAI API Key",
        type="password",
        value=st.session_state.openai_key,
        placeholder="sk-...",
        label_visibility="collapsed",
    )
    if key_in:
        st.session_state.openai_key = key_in
    if get_key():
        st.success("✅ Key configurada")
    else:
        st.warning("Configure a API Key")
    st.divider()

    st.markdown(METHODOLOGY_MD)
    st.divider()

    if st.button("🗑️ Limpar histórico", use_container_width=True):
        st.session_state.results = []
        st.rerun()

# ── HEADER ────────────────────────────────────────────────────
st.markdown(
    "LLM Evaluation Dashboard\n\n"
    "G-Eval · LLM-as-Judge · 5 Dimensões · Chain-of-Thought Scoring\n\n"
    "v1.0 · gpt-4o-mini judge",
    unsafe_allow_html=True,
)

# ── TABS ──────────────────────────────────────────────────────
tab_eval, tab_compare, tab_history = st.tabs([
    "🔬 Avaliar Resposta",
    "⚖️ Comparar Modelos",
    "📋 Histórico",
])

# ════════════════════════════════════════════════════════════════
# TAB 1 — EVALUATE
# ════════════════════════════════════════════════════════════════
with tab_eval:
    st.markdown("◈ Exemplos rápidos", unsafe_allow_html=True)

    # One button per example. A click parks the example in session state and
    # reruns, so the form below is rebuilt with it as default values.
    ex_cols = st.columns(len(EXAMPLES))
    for i, ex in enumerate(EXAMPLES):
        with ex_cols[i]:
            if st.button(ex["label"], key=f"ex_{i}", use_container_width=True):
                st.session_state["ex_load"] = ex
                st.rerun()

    # pop(): the parked example is consumed by exactly one run.
    ex = st.session_state.pop("ex_load", None)
    default = ex or {}

    st.markdown(SPACER, unsafe_allow_html=True)

    col_l, col_r = st.columns([1, 1], gap="large")
    with col_l:
        st.markdown('Pergunta', unsafe_allow_html=True)
        question = st.text_area(
            "Pergunta",
            value=default.get("question", ""),
            height=80,
            key="q_input",
            label_visibility="collapsed",
            placeholder="O que você perguntou ao LLM?",
        )
        st.markdown(
            'Contexto / Documentos recuperados (opcional)',
            unsafe_allow_html=True,
        )
        context = st.text_area(
            "Contexto / Documentos recuperados (opcional)",
            value=default.get("context", ""),
            height=120,
            key="ctx_input",
            label_visibility="collapsed",
            placeholder="Cole o contexto fornecido ao LLM (chunks RAG, etc.)",
        )
    with col_r:
        st.markdown('Resposta do LLM', unsafe_allow_html=True)
        answer = st.text_area(
            "Resposta do LLM",
            value=default.get("answer", ""),
            height=120,
            key="ans_input",
            label_visibility="collapsed",
            placeholder="Cole a resposta gerada pelo LLM para avaliação.",
        )
        st.markdown('Label do Modelo (opcional)', unsafe_allow_html=True)
        model_label = st.text_input(
            "Label do Modelo (opcional)",
            value=default.get("model", ""),
            placeholder="ex: GPT-4o RAG v2",
            label_visibility="collapsed",
        )

    run_btn = st.button("⚗ Executar Avaliação", use_container_width=True, type="primary")

    if run_btn:
        if not get_key():
            st.warning("Configure a OpenAI API Key na sidebar.")
            st.stop()
        # strip(): input made only of whitespace counts as missing.
        if not question.strip() or not answer.strip():
            st.warning("Preencha pelo menos a Pergunta e a Resposta.")
            st.stop()

        # Imported only when an evaluation is actually requested, so the page
        # renders even before the evaluator module is needed.
        from evaluator import EvaluationEngine

        with st.spinner("Avaliando 5 dimensões..."):
            engine = EvaluationEngine(get_key())
            result = engine.evaluate(
                question=question,
                context=context,
                answer=answer,
                model_label=model_label or "LLM",
            )
        st.session_state.results.append(result)

        # ── RESULT ────────────────────────────────────────────
        st.markdown(SPACER, unsafe_allow_html=True)
        st.markdown("---")

        # Verdict banner.
        # NOTE(review): the banner body is blank in the reviewed source; the
        # result's verdict is shown here, as the other two tabs do — confirm.
        st.markdown(
            f"<div><strong>{_esc(result.verdict)}</strong></div>",
            unsafe_allow_html=True,
        )

        r_col, d_col = st.columns([1, 2], gap="large")

        with r_col:
            st.markdown(gauge_svg(result.overall_score, size=160), unsafe_allow_html=True)
            shown_label = _esc(model_label or "LLM")
            st.markdown(
                f"<div>Score Geral</div><div>{shown_label}</div>",
                unsafe_allow_html=True,
            )

            # Compact list: one line per dimension with its score.
            st.markdown(SPACER, unsafe_allow_html=True)
            for dim in result.dimensions:
                color = TIER_COLORS[score_tier(dim.score)]
                st.markdown(
                    f'<div>{_esc(dim.key[:8])} '
                    f'<span style="color:{color}">{dim.score:.1f}</span></div>',
                    unsafe_allow_html=True,
                )

        with d_col:
            # One card per dimension: name, description, score, the judge's
            # reasoning and at most three reported issues.
            for dim in result.dimensions:
                color = TIER_COLORS[score_tier(dim.score)]
                description = _esc(DESCS.get(dim.key, ""))
                issues_html = "".join(
                    f"<div>{_esc(issue)}</div>" for issue in dim.issues[:3]
                )
                st.markdown(
                    "<div>"
                    f"<div><strong>{_esc(dim.name)}</strong></div>"
                    f"<div>{description}</div>"
                    f'<div style="color:{color}">{dim.score:.1f}</div>'
                    f"<div>{_esc(dim.reasoning)}</div>"
                    f"{issues_html}"
                    "</div>",
                    unsafe_allow_html=True,
                )

# ════════════════════════════════════════════════════════════════
# TAB 2 — COMPARE MODELS
# ════════════════════════════════════════════════════════════════
with tab_compare:
    if len(st.session_state.results) < 2:
        st.markdown(
            "Avalie pelo menos 2 respostas para comparar modelos.\n\n"
            'Use a aba "Avaliar Resposta" com labels de modelo diferentes.',
            unsafe_allow_html=True,
        )
    else:
        # Gauges and table both show the four most recent results.
        recent = st.session_state.results[-4:]

        st.markdown("◈ Comparação de resultados recentes", unsafe_allow_html=True)

        # Gauges side by side, one column per result.
        for col, r in zip(st.columns(len(recent)), recent):
            with col:
                st.markdown(gauge_svg(r.overall_score, size=120), unsafe_allow_html=True)
                st.markdown(
                    f"<div><strong>{_esc(r.model_label)}</strong></div>"
                    f"<div>{_esc(r.verdict)}</div>",
                    unsafe_allow_html=True,
                )

        st.markdown(SPACER, unsafe_allow_html=True)

        # Comparison table.
        st.markdown(_comparison_table_html(recent), unsafe_allow_html=True)

# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTORY
# ════════════════════════════════════════════════════════════════
with tab_history:
    history = st.session_state.results
    if not history:
        st.markdown("Nenhuma avaliação executada ainda.", unsafe_allow_html=True)
    else:
        total = len(history)
        # Newest first; only the newest entry starts expanded.
        for i, r in enumerate(reversed(history)):
            title = (
                f"#{total - i} · {r.model_label} · "
                f"{r.overall_score:.1f}/10 · {r.verdict}"
            )
            with st.expander(title, expanded=(i == 0)):
                # Truncate before escaping so an HTML entity is never cut.
                preview = r.answer[:200] + ("..." if len(r.answer) > 200 else "")
                st.markdown(
                    f"<div>Q: {_esc(r.question)}</div>"
                    f"<div>A: {_esc(preview)}</div>",
                    unsafe_allow_html=True,
                )

                # At least five columns (the usual layout); more when a result
                # carries extra dimensions, so none falls outside the grid.
                dim_cols = st.columns(max(5, len(r.dimensions)))
                for col, dim in zip(dim_cols, r.dimensions):
                    with col:
                        color = TIER_COLORS[score_tier(dim.score)]
                        st.markdown(
                            f"<div>{_esc(dim.key[:8])}</div>"
                            f'<div style="color:{color}">{dim.score:.1f}</div>',
                            unsafe_allow_html=True,
                        )