Spaces:
Sleeping
Sleeping
# app.py — LLM Evaluation Dashboard | G-Eval Style | Daniel Fonseca
import html
import math
import os

import streamlit as st
# Page-level configuration, kept ahead of every other Streamlit call.
# NOTE(review): the page icon arrived mis-encoded; the microscope emoji is the
# best match for the surviving bytes and the "laboratory" theme — confirm.
st.set_page_config(
    page_title="LLM Eval · Daniel Fonseca",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)
# ── CSS: "scientific laboratory" theme ──
# Injects the global stylesheet. The classes declared here (eval-*, gauge-*,
# dim-*, score-*, prog-*, verdict-*, compare-*, form-*) are referenced by the
# HTML snippets rendered further down in this file.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:ital,wght@0,400;0,500;1,400&family=Playfair+Display:wght@400;700;900&family=DM+Sans:wght@300;400;500&display=swap');
:root {
    --bg: #f5f4f0;
    --bg2: #eeecea;
    --bg3: #e6e4e0;
    --ink: #1a1814;
    --ink2: #4a4840;
    --ink3: #8a8880;
    --border: #d8d6d0;
    --border2: #c8c6c0;
    --green: #1a7a4a;
    --green-bg: #e8f5ee;
    --amber: #a06010;
    --amber-bg: #fef3dc;
    --red: #b02020;
    --red-bg: #fdecea;
    --blue: #1a4a8a;
    --blue-bg: #e8eef8;
    --accent: #2d5a9e;
}
html, body, [class*="css"] {
    background: var(--bg) !important;
    color: var(--ink) !important;
    font-family: 'DM Sans', sans-serif;
}
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1.2rem; max-width: 1280px; }
/* ── HEADER ── */
.eval-header {
    border-bottom: 2px solid var(--ink);
    padding-bottom: 1rem;
    margin-bottom: 1.5rem;
    display: flex;
    align-items: flex-end;
    justify-content: space-between;
}
.eval-title {
    font-family: 'Playfair Display', Georgia, serif;
    font-weight: 900;
    font-size: 2.2rem;
    color: var(--ink);
    letter-spacing: -0.02em;
    line-height: 1;
}
.eval-subtitle {
    font-family: 'DM Mono', monospace;
    font-size: 0.65rem;
    color: var(--ink3);
    letter-spacing: 0.18em;
    text-transform: uppercase;
    margin-top: 0.3rem;
}
.version-tag {
    font-family: 'DM Mono', monospace;
    font-size: 0.65rem;
    color: var(--ink3);
    border: 1px solid var(--border2);
    padding: 0.2rem 0.6rem;
    border-radius: 2px;
    letter-spacing: 0.1em;
}
/* ── GAUGE / SCORE RING ── */
.gauge-container {
    text-align: center;
    position: relative;
}
.gauge-ring-svg { display: block; margin: 0 auto; }
.gauge-value {
    font-family: 'Playfair Display', serif;
    font-weight: 900;
    font-size: 2.4rem;
    color: var(--ink);
    line-height: 1;
}
.gauge-label {
    font-family: 'DM Mono', monospace;
    font-size: 0.6rem;
    color: var(--ink3);
    letter-spacing: 0.15em;
    text-transform: uppercase;
    margin-top: 0.2rem;
}
/* ── DIMENSION CARD ── */
.dim-card {
    background: white;
    border: 1px solid var(--border);
    border-radius: 4px;
    padding: 1rem 1.2rem;
    margin-bottom: 0.6rem;
    position: relative;
    overflow: hidden;
}
.dim-card::before {
    content: '';
    position: absolute;
    left: 0; top: 0; bottom: 0;
    width: 3px;
}
.dim-excellent::before { background: var(--green); }
.dim-good::before { background: #5aaa70; }
.dim-fair::before { background: var(--amber); }
.dim-poor::before { background: var(--red); }
.dim-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 0.5rem;
}
.dim-name {
    font-family: 'DM Mono', monospace;
    font-size: 0.72rem;
    font-weight: 500;
    color: var(--ink2);
    text-transform: uppercase;
    letter-spacing: 0.1em;
}
.dim-desc {
    font-size: 0.8rem;
    color: var(--ink3);
    margin-top: 0.1rem;
}
.score-chip {
    font-family: 'Playfair Display', serif;
    font-weight: 700;
    font-size: 1.3rem;
    line-height: 1;
}
.score-excellent { color: var(--green); }
.score-good { color: #3a8a50; }
.score-fair { color: var(--amber); }
.score-poor { color: var(--red); }
/* Progress bar */
.prog-track {
    background: var(--bg3);
    border-radius: 2px;
    height: 5px;
    width: 100%;
    margin: 0.5rem 0;
    overflow: hidden;
}
.prog-fill-excellent { height: 100%; background: var(--green); border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-good { height: 100%; background: #3a8a50; border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-fair { height: 100%; background: var(--amber); border-radius: 2px; transition: width 0.6s ease; }
.prog-fill-poor { height: 100%; background: var(--red); border-radius: 2px; transition: width 0.6s ease; }
.dim-reasoning {
    font-size: 0.85rem;
    color: var(--ink2);
    line-height: 1.6;
    font-style: italic;
    margin-top: 0.4rem;
}
.issue-list {
    margin-top: 0.4rem;
    padding: 0;
    list-style: none;
}
.issue-item {
    font-family: 'DM Mono', monospace;
    font-size: 0.72rem;
    color: var(--red);
    padding: 0.15rem 0;
    display: flex;
    align-items: flex-start;
    gap: 0.4rem;
}
.issue-item::before { content: '↳'; color: var(--border2); }
/* ── VERDICT BANNER ── */
.verdict-banner {
    border: 1px solid var(--border);
    border-radius: 4px;
    padding: 1.2rem 1.5rem;
    margin-bottom: 1rem;
    display: flex;
    align-items: center;
    gap: 1.5rem;
}
.verdict-excellent { background: var(--green-bg); border-color: var(--green); }
.verdict-good { background: #eef8f2; border-color: #3a8a50; }
.verdict-fair { background: var(--amber-bg); border-color: var(--amber); }
.verdict-poor { background: var(--red-bg); border-color: var(--red); }
.verdict-label {
    font-family: 'Playfair Display', serif;
    font-weight: 900;
    font-size: 1.8rem;
    letter-spacing: -0.02em;
}
.verdict-excellent .verdict-label { color: var(--green); }
.verdict-good .verdict-label { color: #3a8a50; }
.verdict-fair .verdict-label { color: var(--amber); }
.verdict-poor .verdict-label { color: var(--red); }
.verdict-summary {
    font-size: 0.9rem;
    color: var(--ink2);
    line-height: 1.6;
}
/* ── COMPARISON TABLE ── */
.compare-row {
    display: grid;
    grid-template-columns: 160px 1fr 1fr;
    gap: 0;
    border-bottom: 1px solid var(--border);
    padding: 0.5rem 0;
    align-items: center;
}
.compare-row:first-child {
    border-top: 2px solid var(--ink);
    font-family: 'DM Mono', monospace;
    font-size: 0.65rem;
    text-transform: uppercase;
    letter-spacing: 0.1em;
    color: var(--ink3);
    padding-top: 0.4rem;
}
.compare-dim {
    font-family: 'DM Mono', monospace;
    font-size: 0.72rem;
    color: var(--ink3);
    text-transform: uppercase;
    letter-spacing: 0.05em;
}
.compare-score {
    font-family: 'Playfair Display', serif;
    font-weight: 700;
    font-size: 1.1rem;
    text-align: center;
}
.winner-cell {
    background: #fffbe6;
    border-radius: 2px;
}
/* ── INPUT FORM ── */
.form-section {
    background: white;
    border: 1px solid var(--border);
    border-radius: 4px;
    padding: 1.2rem 1.5rem;
    margin-bottom: 1rem;
}
.form-label {
    font-family: 'DM Mono', monospace;
    font-size: 0.65rem;
    color: var(--ink3);
    text-transform: uppercase;
    letter-spacing: 0.12em;
    margin-bottom: 0.3rem;
    display: block;
}
/* Streamlit overrides */
section[data-testid="stSidebar"] {
    background: #f0eeea !important;
    border-right: 1px solid var(--border) !important;
}
section[data-testid="stSidebar"] * { color: var(--ink2) !important; }
.stTextArea textarea {
    background: var(--bg) !important;
    border: 1px solid var(--border2) !important;
    border-radius: 3px !important;
    font-family: 'DM Mono', monospace !important;
    font-size: 0.82rem !important;
    color: var(--ink) !important;
}
.stTextInput input {
    background: var(--bg) !important;
    border: 1px solid var(--border2) !important;
    border-radius: 3px !important;
    font-family: 'DM Mono', monospace !important;
    font-size: 0.82rem !important;
    color: var(--ink) !important;
}
.stButton button {
    background: var(--ink) !important;
    color: white !important;
    border: none !important;
    border-radius: 3px !important;
    font-family: 'DM Mono', monospace !important;
    font-size: 0.75rem !important;
    letter-spacing: 0.08em !important;
    padding: 0.5rem 1.2rem !important;
    text-transform: uppercase !important;
}
.stButton button:hover { background: var(--accent) !important; }
div[data-testid="stTabs"] button {
    font-family: 'DM Mono', monospace !important;
    font-size: 0.72rem !important;
    text-transform: uppercase !important;
    letter-spacing: 0.08em !important;
    color: var(--ink3) !important;
}
hr { border-color: var(--border) !important; }
</style>
""", unsafe_allow_html=True)
# ── SESSION STATE ──
# Seed the session keys this script relies on. A key that is already present
# is left alone, so stored results and the typed API key survive reruns.
for k, v in {
    'results': [],
    'openai_key': '',
}.items():
    if k not in st.session_state:
        st.session_state[k] = v
# ── HELPERS ──
def get_key():
    """Return the OpenAI API key to use, or a falsy value when none is set.

    Sources are tried in order and the first non-empty one wins:

    1. ``st.secrets['OPENAI_API_KEY']``
    2. the ``OPENAI_API_KEY`` environment variable
    3. the key typed into the sidebar (``st.session_state.openai_key``)
    """
    try:
        secret = st.secrets['OPENAI_API_KEY'] if 'OPENAI_API_KEY' in st.secrets else ''
    except Exception:
        # Deliberate best effort: any failure while reading st.secrets
        # (presumably "no secrets file configured") just means "no secret".
        secret = ''
    # `or` chaining rather than a getenv default, so an empty secret or an
    # empty environment variable does not mask the key typed in the sidebar.
    return secret or os.getenv('OPENAI_API_KEY') or st.session_state.openai_key
def score_tier(s: float) -> str:
    """Map a score to the tier name used in CSS class suffixes.

    Args:
        s: Score to classify.

    Returns:
        ``"excellent"`` for ``s >= 8.5``, ``"good"`` for ``s >= 7.0``,
        ``"fair"`` for ``s >= 5.0``, otherwise ``"poor"``.
    """
    if s >= 8.5:
        return "excellent"
    if s >= 7.0:
        return "good"
    if s >= 5.0:
        return "fair"
    return "poor"
def score_pct(s: float) -> str:
    """Format a score as a percentage string for use as a CSS width.

    Args:
        s: Score; ``s * 10`` is taken as the percentage.

    Returns:
        The percentage with no decimals and a trailing ``%``, clamped to
        ``0%``–``100%`` so an out-of-range score cannot yield a negative or
        overflowing width.
    """
    pct = min(max(s * 10, 0.0), 100.0)
    return f"{pct:.0f}%"
def gauge_svg(score: float, size: int = 130) -> str:
    """Build the inline SVG markup for a circular score gauge.

    A grey full-circle track is overlaid with an arc in the tier colour whose
    length is ``score / 10`` of the circumference. The score (one decimal)
    and ``/10`` are printed at the centre of the ring.

    Args:
        score: Score to display. The arc length is clamped to the 0–10 range;
            the printed number is the value as given.
        size: SVG width in pixels. The SVG height is ``size - 10``. The ring
            radius is fixed, so the ring needs a height of at least 96 px.

    Returns:
        The ``<svg>`` element as a string.
    """
    tier = score_tier(score)
    colors = {"excellent": "#1a7a4a", "good": "#3a8a50", "fair": "#a06010", "poor": "#b02020"}
    color = colors[tier]
    r = 44
    height = size - 10
    # Centre the ring in the actual viewBox. The previous centre
    # (size // 2 + 10) pushed the ring past the bottom edge whenever
    # size < 136, so it was drawn clipped.
    cx, cy = size // 2, height // 2
    circ = 2 * math.pi * r
    # Clamp so an out-of-range score cannot produce a negative dash or gap,
    # which is not a valid stroke-dasharray.
    pct = min(max(score / 10.0, 0.0), 1.0)
    dash = circ * pct
    gap = circ - dash
    # A quarter-turn dash offset makes the arc start at the top of the ring.
    return f"""
    <svg width="{size}" height="{height}" viewBox="0 0 {size} {height}">
    <circle cx="{cx}" cy="{cy}" r="{r}" fill="none"
    stroke="#e8e6e0" stroke-width="8"/>
    <circle cx="{cx}" cy="{cy}" r="{r}" fill="none"
    stroke="{color}" stroke-width="8"
    stroke-dasharray="{dash:.1f} {gap:.1f}"
    stroke-dashoffset="{circ * 0.25:.1f}"
    stroke-linecap="round"/>
    <text x="{cx}" y="{cy+4}" text-anchor="middle"
    font-family="Playfair Display, serif"
    font-weight="900" font-size="22" fill="#1a1814">{score:.1f}</text>
    <text x="{cx}" y="{cy+18}" text-anchor="middle"
    font-family="DM Mono, monospace"
    font-size="7" fill="#8a8880" letter-spacing="1">/10</text>
    </svg>"""
# ── SIDEBAR ──
# Branding, API-key entry, a short methodology note and a "clear history"
# button.
# NOTE(review): the emoji in this block arrived mis-encoded. The key, check
# mark and wastebasket are best-guess restorations; the glyph in front of
# "Metodologia G-Eval" could not be recovered and is left exactly as received.
with st.sidebar:
    st.markdown("""
    <div style='font-family:Playfair Display,serif;font-weight:900;
    font-size:1.4rem;color:#1a1814;letter-spacing:-0.02em'>LLM Eval</div>
    <div style='font-family:DM Mono,monospace;font-size:0.6rem;
    color:#8a8880;letter-spacing:0.2em;text-transform:uppercase'>
    Evaluation Dashboard
    </div>
    """, unsafe_allow_html=True)
    st.divider()
    st.markdown("**🔑 OpenAI API Key**")
    # The label is hidden but must not be empty: Streamlit discourages empty
    # widget labels for accessibility reasons.
    key_in = st.text_input("OpenAI API Key", type="password",
                           value=st.session_state.openai_key,
                           placeholder="sk-...", label_visibility="collapsed")
    if key_in:
        st.session_state.openai_key = key_in
    if get_key():
        st.success("✓ Key configurada")
    else:
        st.warning("Configure a API Key")
    st.divider()
    # Blank lines separate the title, the paragraph and the table so that the
    # table is parsed as a table rather than as running text.
    st.markdown("""
**π Metodologia G-Eval**

Baseado em *Liu et al., 2023 (NeurIPS)*.
Cada dimensão é avaliada por um LLM-judge
com Chain-of-Thought scoring.

| Dimensão | Peso |
|---|---|
| Faithfulness | 30% |
| Relevance | 25% |
| Completeness | 20% |
| Hallucination | 15% |
| Conciseness | 10% |
""")
    st.divider()
    if st.button("🗑️ Limpar histórico", use_container_width=True):
        st.session_state.results = []
        st.rerun()
# ── HEADER ──
# Static page title, subtitle and version tag (styled by the eval-* classes).
st.markdown("""
<div class="eval-header">
<div>
<div class="eval-title">LLM Evaluation Dashboard</div>
<div class="eval-subtitle">G-Eval · LLM-as-Judge · 5 Dimensões · Chain-of-Thought Scoring</div>
</div>
<div class="version-tag">v1.0 · gpt-4o-mini judge</div>
</div>
""", unsafe_allow_html=True)
# ── TABS ──
# NOTE(review): the tab emoji arrived mis-encoded. The microscope and scales
# are best-guess restorations; the glyph in front of "Histórico" could not be
# recovered and is left exactly as received.
tab_eval, tab_compare, tab_history = st.tabs([
    "🔬 Avaliar Resposta",
    "⚖️ Comparar Modelos",
    "π Histórico",
])
# ────────────────────────────────────────────────────────────────
# TAB 1 — EVALUATE A RESPONSE
# ────────────────────────────────────────────────────────────────
# Form (question / context / answer / model label) plus the rendered result of
# the most recent evaluation run from this tab.
# NOTE(review): the single glyphs in front of "Exemplos rápidos" and
# "Executar Avaliação" arrived mis-encoded and could not be recovered; they
# are left exactly as received. The dash in the example labels is restored as
# an em dash (an en dash is equally possible).
with tab_eval:
    # Quick examples that pre-fill the form.
    EXAMPLES = [
        {
            "label": "RAG — Resposta boa",
            "question": "Quais projetos do portfólio usam PyTorch Geometric?",
            "context": "HetGNN Fraud usa PyTorch Geometric com HGTConv. DOMINANT usa PyTorch Geometric com GCNConv e Autoencoder. GraphSAGE Elliptic usa SAGEConv e GCNConv do PyTorch Geometric.",
            "answer": "Três projetos utilizam PyTorch Geometric: HetGNN Fraud (com HGTConv), DOMINANT (com GCNConv + Autoencoder) e GraphSAGE Elliptic (com SAGEConv e GCNConv).",
            "model": "GPT-4o-mini RAG",
        },
        {
            "label": "RAG — Com alucinação",
            "question": "Qual o AUC do projeto TGN?",
            "context": "TGN Fraud Detection tem AUC de 0.91 em dataset sintético de e-commerce.",
            "answer": "O projeto TGN alcança AUC de 0.97 no dataset Elliptic Bitcoin, superando todos os outros modelos do portfólio em dados reais.",
            "model": "GPT-3.5 RAG",
        },
        {
            "label": "QA — Resposta incompleta",
            "question": "Explique a diferença entre aprendizado inductive e transductive em GNNs.",
            "context": "Inductive learning (GraphSAGE) generaliza para nós novos sem retreinar. Transductive learning (GCN clássico) só funciona nos nós vistos no treino. O dataset Elliptic demonstra a vantagem inductive em split temporal.",
            "answer": "Inductive learning consegue generalizar para novos nós.",
            "model": "LLM básico",
        },
    ]
    st.markdown("""
    <div style='font-family:DM Mono,monospace;font-size:0.65rem;color:#8a8880;
    text-transform:uppercase;letter-spacing:0.12em;margin-bottom:0.5rem'>
    β Exemplos rápidos
    </div>""", unsafe_allow_html=True)
    ex_cols = st.columns(len(EXAMPLES))
    for i, ex in enumerate(EXAMPLES):
        with ex_cols[i]:
            if st.button(ex["label"], key=f"ex_{i}", use_container_width=True):
                # Load the example by writing into the form widgets' own
                # session-state keys. The widgets are created further down in
                # this same run, so they pick the values up directly.
                # Changing `value=` on an already-rendered keyed widget is not
                # a reliable way to update it.
                st.session_state["q_input"] = ex["question"]
                st.session_state["ctx_input"] = ex["context"]
                st.session_state["ans_input"] = ex["answer"]
                st.session_state["model_input"] = ex["model"]
    st.markdown("<br>", unsafe_allow_html=True)
    col_l, col_r = st.columns([1, 1], gap="large")
    # Widget labels are hidden (the styled <span> above each field is the
    # visible label) but are non-empty, as Streamlit discourages empty labels.
    with col_l:
        st.markdown('<span class="form-label">Pergunta</span>', unsafe_allow_html=True)
        question = st.text_area("Pergunta",
                                height=80, key="q_input", label_visibility="collapsed",
                                placeholder="O que você perguntou ao LLM?")
        st.markdown('<span class="form-label">Contexto / Documentos recuperados (opcional)</span>',
                    unsafe_allow_html=True)
        context = st.text_area("Contexto / Documentos recuperados (opcional)",
                               height=120, key="ctx_input", label_visibility="collapsed",
                               placeholder="Cole o contexto fornecido ao LLM (chunks RAG, etc.)")
    with col_r:
        st.markdown('<span class="form-label">Resposta do LLM</span>', unsafe_allow_html=True)
        answer = st.text_area("Resposta do LLM",
                              height=120, key="ans_input", label_visibility="collapsed",
                              placeholder="Cole a resposta gerada pelo LLM para avaliação.")
        st.markdown('<span class="form-label">Label do Modelo (opcional)</span>',
                    unsafe_allow_html=True)
        model_label = st.text_input("Label do Modelo (opcional)", key="model_input",
                                    placeholder="ex: GPT-4o RAG v2",
                                    label_visibility="collapsed")
    run_btn = st.button("β Executar Avaliação", use_container_width=True, type="primary")
    if run_btn:
        if not get_key():
            st.warning("Configure a OpenAI API Key na sidebar.")
            st.stop()
        if not question or not answer:
            st.warning("Preencha pelo menos a Pergunta e a Resposta.")
            st.stop()
        # Project-local import, kept at its original place inside this branch.
        from evaluator import EvaluationEngine
        with st.spinner("Avaliando 5 dimensões..."):
            engine = EvaluationEngine(get_key())
            result = engine.evaluate(
                question=question,
                context=context,
                answer=answer,
                model_label=model_label or "LLM",
            )
        st.session_state.results.append(result)
        # ── RESULT ──
        # Free text coming from the user or from the judge model is
        # HTML-escaped before being placed into unsafe_allow_html markup, so
        # it can neither inject markup nor break the layout.
        st.markdown("<br>", unsafe_allow_html=True)
        st.markdown("---")
        # Verdict banner
        summary_html = html.escape(str(result.summary))
        st.markdown(f"""
        <div class="verdict-banner verdict-{result.verdict_color}">
        <div class="verdict-label">{result.verdict}</div>
        <div>
        <div style='font-family:Playfair Display,serif;font-weight:700;
        font-size:2rem;line-height:1'>{result.overall_score:.1f}<span style='font-size:1rem;
        font-weight:400;color:#8a8880'>/10</span></div>
        <div class="verdict-summary">{summary_html}</div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        # Dimensions
        r_col, d_col = st.columns([1, 2], gap="large")
        with r_col:
            st.markdown(gauge_svg(result.overall_score, size=160), unsafe_allow_html=True)
            label_html = html.escape(model_label or "LLM")
            st.markdown(f"""
            <div style='text-align:center;margin-top:0.5rem'>
            <div style='font-family:DM Mono,monospace;font-size:0.65rem;
            color:#8a8880;text-transform:uppercase;letter-spacing:0.1em'>
            Score Geral
            </div>
            <div style='font-family:DM Mono,monospace;font-size:0.72rem;
            color:#4a4840;margin-top:0.3rem'>{label_html}</div>
            </div>
            """, unsafe_allow_html=True)
            # Compact score bar for each dimension
            st.markdown("<br>", unsafe_allow_html=True)
            for dim in result.dimensions:
                tier = score_tier(dim.score)
                st.markdown(f"""
                <div style='display:flex;justify-content:space-between;
                align-items:center;margin:0.3rem 0'>
                <span style='font-family:DM Mono,monospace;font-size:0.65rem;
                color:#8a8880;text-transform:uppercase'>{dim.key[:8]}</span>
                <span class='score-chip score-{tier}'>{dim.score:.1f}</span>
                </div>
                <div class='prog-track'>
                <div class='prog-fill-{tier}' style='width:{score_pct(dim.score)}'></div>
                </div>
                """, unsafe_allow_html=True)
        with d_col:
            DESCS = {
                "faithfulness": "Resposta fiel ao contexto? Sem alucinações?",
                "relevance": "Responde diretamente à pergunta?",
                "completeness": "Cobre todos os aspectos relevantes?",
                "conciseness": "Direta, sem verbosidade desnecessária?",
                "hallucination": "Detecta afirmações não suportadas.",
            }
            for dim in result.dimensions:
                tier = score_tier(dim.score)
                # At most three issues are listed per dimension.
                issues_html = "".join(
                    f'<li class="issue-item">{html.escape(str(issue))}</li>'
                    for issue in dim.issues[:3]
                )
                # Appended on the same line as the reasoning: an empty
                # placeholder on a line of its own would leave a blank line
                # inside the HTML block and end it early.
                issues_block = f'<ul class="issue-list">{issues_html}</ul>' if issues_html else ''
                reasoning_html = html.escape(str(dim.reasoning))
                st.markdown(f"""
                <div class="dim-card dim-{tier}">
                <div class="dim-header">
                <div>
                <div class="dim-name">{dim.name}</div>
                <div class="dim-desc">{DESCS.get(dim.key, '')}</div>
                </div>
                <span class="score-chip score-{tier}">{dim.score:.1f}</span>
                </div>
                <div class="prog-track">
                <div class="prog-fill-{tier}" style="width:{score_pct(dim.score)}"></div>
                </div>
                <div class="dim-reasoning">{reasoning_html}</div>{issues_block}
                </div>
                """, unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────
# TAB 2 — COMPARE MODELS
# ────────────────────────────────────────────────────────────────
# Side-by-side gauges and a per-dimension score table for the most recent
# evaluations; needs at least two stored results.
# NOTE(review): the glyph in front of "Comparação de resultados recentes"
# arrived mis-encoded and could not be recovered; it is left as received.
with tab_compare:
    if len(st.session_state.results) < 2:
        st.markdown("""
        <div style='text-align:center;padding:3rem;color:#8a8880;
        font-family:DM Mono,monospace;font-size:0.8rem'>
        Avalie pelo menos 2 respostas para comparar modelos.<br>
        Use a aba "Avaliar Resposta" com labels de modelo diferentes.
        </div>
        """, unsafe_allow_html=True)
    else:
        results = st.session_state.results[-6:]  # last 6
        # Only the four most recent results are actually displayed.
        shown = results[-4:]
        st.markdown("""
        <div style='font-family:DM Mono,monospace;font-size:0.65rem;
        color:#8a8880;text-transform:uppercase;letter-spacing:0.15em;
        margin-bottom:1rem'>β Comparação de resultados recentes</div>
        """, unsafe_allow_html=True)
        # Gauges side by side
        gcols = st.columns(len(shown))
        for gcol, r in zip(gcols, shown):
            with gcol:
                tier = score_tier(r.overall_score)
                # Model labels are user-supplied: escape before embedding.
                label_html = html.escape(str(r.model_label))
                st.markdown(gauge_svg(r.overall_score, size=120), unsafe_allow_html=True)
                st.markdown(f"""
                <div style='text-align:center'>
                <div style='font-family:DM Mono,monospace;font-size:0.65rem;
                color:#4a4840;overflow:hidden;text-overflow:ellipsis;
                white-space:nowrap;max-width:120px;margin:0 auto'>
                {label_html}
                </div>
                <div class='score-chip score-{tier}' style='font-size:0.8rem'>
                {r.verdict}
                </div>
                </div>
                """, unsafe_allow_html=True)
        st.markdown("<br>", unsafe_allow_html=True)
        # Comparison table
        dim_keys = ["faithfulness", "relevance", "completeness", "hallucination", "conciseness"]
        dim_names = {
            "faithfulness": "Faithfulness", "relevance": "Relevance",
            "completeness": "Completeness", "hallucination": "Hallucination",
            "conciseness": "Conciseness",
        }
        # The .compare-row class fixes the grid at two score columns, but up
        # to four models are shown; override the template per row so every
        # model gets its own column.
        grid_style = f"grid-template-columns:160px repeat({len(shown)},1fr)"
        parts = ['<div style="overflow-x:auto">']
        # Header
        parts.append(f'<div class="compare-row" style="{grid_style}">')
        parts.append('<div class="compare-dim">Dimensão</div>')
        for r in shown:
            head_html = html.escape(str(r.model_label)[:16])
            parts.append(
                '<div style="font-family:DM Mono,monospace;font-size:0.65rem;'
                'color:#8a8880;text-transform:uppercase;text-align:center">'
                f'{head_html}</div>'
            )
        parts.append('</div>')
        # Rows
        for dk in dim_keys:
            # One entry per shown result. None marks a result that lacks this
            # dimension, so later scores stay under the right model.
            row_scores = [
                next((d.score for d in r.dimensions if d.key == dk), None)
                for r in shown
            ]
            present = [s for s in row_scores if s is not None]
            max_s = max(present) if present else None
            parts.append(f'<div class="compare-row" style="{grid_style}">')
            parts.append(f'<div class="compare-dim">{dim_names[dk]}</div>')
            for s in row_scores:
                if s is None:
                    parts.append('<div class="compare-score">–</div>')
                    continue
                tier = score_tier(s)
                # Ties for the highest score are all highlighted.
                winner_cls = "winner-cell" if s == max_s else ""
                parts.append(f'<div class="compare-score score-{tier} {winner_cls}">{s:.1f}</div>')
            parts.append('</div>')
        # Overall
        overall_scores = [r.overall_score for r in shown]
        max_ov = max(overall_scores)
        parts.append(
            f'<div class="compare-row" style="{grid_style};'
            'border-top:2px solid #1a1814;font-weight:700">'
        )
        parts.append('<div class="compare-dim" style="color:#1a1814;font-weight:700">OVERALL</div>')
        for s in overall_scores:
            tier = score_tier(s)
            winner_cls = "winner-cell" if s == max_ov else ""
            parts.append(
                f'<div class="compare-score score-{tier} {winner_cls}" '
                f'style="font-size:1.3rem">{s:.1f}</div>'
            )
        parts.append('</div>')
        parts.append('</div>')
        table_html = "".join(parts)
        st.markdown(table_html, unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────
# TAB 3 — HISTORY
# ────────────────────────────────────────────────────────────────
# Every stored evaluation, newest first, one expander each; only the newest
# starts expanded.
with tab_history:
    if not st.session_state.results:
        st.markdown("""
        <div style='text-align:center;padding:3rem;color:#8a8880;
        font-family:DM Mono,monospace;font-size:0.8rem'>
        Nenhuma avaliação executada ainda.
        </div>
        """, unsafe_allow_html=True)
    else:
        total = len(st.session_state.results)
        for i, r in enumerate(reversed(st.session_state.results)):
            with st.expander(
                f"#{total - i} · {r.model_label} · {r.overall_score:.1f}/10 · {r.verdict}",
                expanded=(i == 0)
            ):
                # Question and answer are free text: escape them before
                # embedding in unsafe_allow_html markup. The answer is cut to
                # 200 characters before escaping so no entity is split.
                question_html = html.escape(str(r.question))
                answer_html = html.escape(str(r.answer[:200]))
                ellipsis = '...' if len(r.answer) > 200 else ''
                st.markdown(f"""
                <div style='font-family:DM Mono,monospace;font-size:0.72rem;
                color:#4a4840;margin-bottom:0.5rem'>
                <b>Q:</b> {question_html}
                </div>
                <div style='font-family:DM Mono,monospace;font-size:0.72rem;
                color:#8a8880;margin-bottom:0.8rem'>
                <b>A:</b> {answer_html}{ellipsis}
                </div>
                """, unsafe_allow_html=True)
                # Five columns as before; widened only when a result carries
                # more than five dimensions, so indexing cannot overflow.
                dim_cols = st.columns(max(5, len(r.dimensions)))
                for j, dim in enumerate(r.dimensions):
                    dtier = score_tier(dim.score)
                    with dim_cols[j]:
                        st.markdown(f"""
                        <div style='text-align:center'>
                        <div style='font-family:DM Mono,monospace;font-size:0.6rem;
                        color:#8a8880;text-transform:uppercase'>{dim.key[:8]}</div>
                        <div class='score-chip score-{dtier}'>{dim.score:.1f}</div>
                        </div>
                        """, unsafe_allow_html=True)