Spaces:
Configuration error
Configuration error
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| <title>Localpager GEPA Reports</title> | |
| <style> | |
| /* Colour tokens; the rules below read them through var(). */ | |
| :root { color-scheme:light; --bg:#f6f8fb; --panel:#fff; --ink:#172033; --muted:#667085; --line:#d8dee9; --green:#137333; --red:#b42318; --blue:#185abc; --amber:#ad5f00; --gray:#64748b } | |
| /* Page frame: body, sticky header, centred main column. */ | |
| * { box-sizing:border-box } | |
| body { margin:0; background:var(--bg); color:var(--ink); font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,Arial,sans-serif; line-height:1.45 } | |
| header { background:#fff; border-bottom:1px solid var(--line); padding:28px 32px 20px; position:sticky; top:0; z-index:2 } | |
| main { max-width:1280px; margin:0 auto; padding:24px 32px 52px } | |
| h1 { margin:0 0 8px; font-size:28px } | |
| h2 { margin:0 0 14px; font-size:20px } | |
| a { color:#0b57d0; text-decoration:none } | |
| .muted { color:var(--muted) } | |
| /* Header navigation and pill-style links. */ | |
| nav { display:flex; gap:10px; flex-wrap:wrap; margin-top:14px } | |
| nav a, .button { border:1px solid #c9dcff; background:#eef4ff; border-radius:7px; padding:7px 10px; display:inline-block } | |
| /* Each section is rendered as a bordered panel. */ | |
| section { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:20px; margin-bottom:20px } | |
| /* Summary cards. */ | |
| .gridcards { display:grid; grid-template-columns:repeat(auto-fit,minmax(180px,1fr)); gap:12px } | |
| .card { border:1px solid var(--line); border-radius:8px; padding:14px; background:#fbfcfe; min-height:104px } | |
| .card .label { color:var(--muted); font-size:12px; text-transform:uppercase; letter-spacing:.04em } | |
| .card .value { margin-top:8px; font-size:26px; font-weight:750 } | |
| .card .sub { margin-top:5px; color:var(--muted); font-size:13px } | |
| /* Emphasis colours for values and the amber call-out note. */ | |
| .good { color:var(--green); font-weight:700 } | |
| .bad { color:var(--red); font-weight:700 } | |
| .note { border-left:4px solid var(--amber); padding:10px 12px; background:#fff8eb; color:#4a2d00 } | |
| /* Inline SVG chart: the wrapper scrolls horizontally because the svg has a min-width. */ | |
| .chart-wrap { border:1px solid var(--line); border-radius:8px; background:#fff; overflow-x:auto } | |
| svg { min-width:980px; width:100%; height:auto; display:block } | |
| .axis { stroke:#94a3b8; stroke-width:1.4 } | |
| .grid { stroke:#e5eaf2 } | |
| .tick { fill:#667085; font-size:12px } | |
| .axis-label { fill:#334155; font-size:13px } | |
| .legend { display:flex; gap:16px; flex-wrap:wrap; margin-top:10px; color:#334155; font-size:13px } | |
| .legend i { display:inline-block; width:10px; height:10px; border-radius:999px; margin-right:6px; vertical-align:-1px } | |
| /* Link panels pointing at the detailed reports. */ | |
| .panel-grid { display:grid; grid-template-columns:repeat(auto-fit,minmax(280px,1fr)); gap:16px } | |
| .panel-link { display:block; border:1px solid var(--line); border-radius:8px; background:#fbfcfe; padding:16px } | |
| .panel-link strong { display:block; color:var(--ink); font-size:16px; margin-bottom:5px } | |
| .panel-link span { color:var(--muted); font-size:13px } | |
| /* Metric table inside a scrollable frame. */ | |
| .scroll { overflow:auto; border:1px solid var(--line); border-radius:8px } | |
| table { width:100%; border-collapse:collapse; font-size:13px } | |
| th, td { border-bottom:1px solid var(--line); padding:8px 9px; text-align:left; vertical-align:middle } | |
| th { background:#f0f3f8; color:#384152; font-weight:700; line-height:1.2; white-space:nowrap } | |
| code { font-family:ui-monospace,SFMono-Regular,Consolas,monospace; font-size:12px } | |
| /* At 800px and below: header stops being sticky and side padding shrinks. */ | |
| @media (max-width:800px) { | |
| header { position:static } | |
| main, header { padding-left:18px; padding-right:18px } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- Page header: title, one-line description with the update date, and shortcut links into the current run's report directory. --> | |
| <header> | |
| <h1>Localpager GEPA Reports</h1> | |
| <!-- The date is marked up with a time element so it is machine-readable as well as visible. --> | |
| <div class="muted">Default dashboard for the Evalstate Qwen GEPA overlay run. Updated <time datetime="2026-06-17">2026-06-17</time>.</div> | |
| <nav> | |
| <a href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/final-report.html">Final report</a> | |
| <a href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/whole-dataset-comparison.html">Whole dataset graph</a> | |
| <a href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/proposal-graphs.html">Proposal graphs</a> | |
| <a href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/score_report.html">Iteration graph</a> | |
| <a href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/prompt-diffs/index.html">Prompt diffs</a> | |
| </nav> | |
| </header> | |
| <main> | |
| <!-- Summary cards: each card holds a label, a headline value, and a sub-line with the comparison figure. --> | |
| <section> | |
| <h2>Bottom Line</h2> | |
| <div class="gridcards"> | |
| <div class="card"> | |
| <div class="label">Full 330 GEPA mean</div> | |
| <div class="value good">0.7350</div> | |
| <div class="sub">v10 0.7307, delta +0.0043</div> | |
| </div> | |
| <div class="card"> | |
| <div class="label">Full 330 Micro-F1</div> | |
| <div class="value bad">0.8206</div> | |
| <div class="sub">v10 0.8231, delta -0.0025</div> | |
| </div> | |
| <div class="card"> | |
| <div class="label">Precision / Recall</div> | |
| <div class="value">0.8246 / 0.8167</div> | |
| <div class="sub">precision down, recall up</div> | |
| </div> | |
| <div class="card"> | |
| <div class="label">FP / FN</div> | |
| <div class="value">110 / 116</div> | |
| <div class="sub">v10 102 / 119</div> | |
| </div> | |
| <div class="card"> | |
| <div class="label">Heldout Micro-F1</div> | |
| <div class="value good">0.8417</div> | |
| <div class="sub">v10 0.8296, delta +0.0121</div> | |
| </div> | |
| <div class="card"> | |
| <div class="label">Best Pareto score</div> | |
| <div class="value good">0.6979</div> | |
| <div class="sub">seed 0.5742</div> | |
| </div> | |
| </div> | |
| <!-- Verdict call-out shown under the cards. --> | |
| <p class="note">GEPA-best is not a clear replacement for v10. It improves the GEPA objective and exact match, but false positives increase and full-set micro-F1 is slightly lower.</p> | |
| </section> | |
| <!-- Grouped bar chart: four metric groups, each with a v10 bar (#64748b) and a GEPA-best bar (#2563eb). Gridlines mark 0.50 to 0.80 and the baseline is y=308. --> | |
| <section> | |
| <h2>Whole 330 Score Graph</h2> | |
| <div class="chart-wrap"><svg viewBox="0 0 1080 380" role="img" aria-label="GEPA best versus v10 score comparison"><rect width="100%" height="100%" fill="#fff"/><line x1="70" y1="308.0" x2="1056" y2="308.0" class="grid"/><text x="60" y="312.0" text-anchor="end" class="tick">0.50</text><line x1="70" y1="220.5" x2="1056" y2="220.5" class="grid"/><text x="60" y="224.5" text-anchor="end" class="tick">0.60</text><line x1="70" y1="133.0" x2="1056" y2="133.0" class="grid"/><text x="60" y="137.0" text-anchor="end" class="tick">0.70</text><line x1="70" y1="45.5" x2="1056" y2="45.5" class="grid"/><text x="60" y="49.5" text-anchor="end" class="tick">0.80</text><line x1="70" y1="308" x2="1056" y2="308" class="axis"/><line x1="70" y1="28" x2="70" y2="308" class="axis"/> | |
| <rect x="138" y="106.5" width="54" height="201.5" rx="3" fill="#64748b"><title>v10 GEPA mean 0.7307</title></rect><text x="165" y="99" text-anchor="middle" class="tick">0.7307</text><rect x="201" y="102.7" width="54" height="205.3" rx="3" fill="#2563eb"><title>GEPA best GEPA mean 0.7350</title></rect><text x="228" y="95" text-anchor="middle" class="tick">0.7350</text><text x="196" y="345" text-anchor="middle" class="axis-label">GEPA mean</text> | |
| <rect x="390" y="25.9" width="54" height="282.1" rx="3" fill="#64748b"><title>v10 F1 0.8231</title></rect><text x="417" y="22" text-anchor="middle" class="tick">0.8231</text><rect x="453" y="28.1" width="54" height="279.9" rx="3" fill="#2563eb"><title>GEPA best F1 0.8206</title></rect><text x="480" y="22" text-anchor="middle" class="tick">0.8206</text><text x="448" y="345" text-anchor="middle" class="axis-label">Micro-F1</text> | |
| <!-- The v10 precision bar starts at y=15.5, so its label baseline is y=11 to keep the digits above the bar; a baseline of 22 would draw grey text inside the grey bar. --> | |
| <rect x="642" y="15.5" width="54" height="292.5" rx="3" fill="#64748b"><title>v10 precision 0.8344</title></rect><text x="669" y="11" text-anchor="middle" class="tick">0.8344</text><rect x="705" y="24.1" width="54" height="283.9" rx="3" fill="#2563eb"><title>GEPA best precision 0.8246</title></rect><text x="732" y="22" text-anchor="middle" class="tick">0.8246</text><text x="700" y="345" text-anchor="middle" class="axis-label">Precision</text> | |
| <rect x="894" y="35.1" width="54" height="272.9" rx="3" fill="#64748b"><title>v10 recall 0.8120</title></rect><text x="921" y="28" text-anchor="middle" class="tick">0.8120</text><rect x="957" y="31.0" width="54" height="277.0" rx="3" fill="#2563eb"><title>GEPA best recall 0.8167</title></rect><text x="984" y="24" text-anchor="middle" class="tick">0.8167</text><text x="952" y="345" text-anchor="middle" class="axis-label">Recall</text> | |
| <text x="563" y="374" text-anchor="middle" class="axis-label">full 330 repaired outputs</text></svg><div class="legend"><span><i style="background:#64748b"></i>v10 seed</span><span><i style="background:#2563eb"></i>GEPA best</span></div></div> | |
| </section> | |
| <!-- Link panels: one block-level link per detailed report, each with a bold title and a one-line description. --> | |
| <section> | |
| <h2>Open The Detailed Graphs</h2> | |
| <div class="panel-grid"> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/proposal-graphs.html"> | |
| <strong>Proposal Graphs</strong> | |
| <span>Every proposal attempt, accepted/rejected status, subsample deltas, and best-so-far Pareto score.</span> | |
| </a> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/whole-dataset-comparison.html"> | |
| <strong>Whole Dataset Comparison</strong> | |
| <span>Full 330-row GEPA-best versus v10 charts and metric table.</span> | |
| </a> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/score_report.html"> | |
| <strong>Iteration Graph</strong> | |
| <span>Original GEPA score report for the run.</span> | |
| </a> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/prompt-diffs/index.html"> | |
| <strong>Prompt Diff Picker</strong> | |
| <span>Dropdown-to-dropdown comparison for individual candidate prompts.</span> | |
| </a> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/candidate_tree.html"> | |
| <strong>Candidate Tree</strong> | |
| <span>GEPA-native candidate tree visualization.</span> | |
| </a> | |
| <a class="panel-link" href="gepa-evalstate-qwen-overlay-c4-full-20260616T172947Z/final-report.html"> | |
| <strong>Final Report</strong> | |
| <span>Run settings, heldout comparison, whole-330 check, and artifact links.</span> | |
| </a> | |
| </div> | |
| </section> | |
| <!-- Metric comparison table: v10 seed versus GEPA best, with the signed delta in the last column. --> | |
| <section> | |
| <h2>Metric Table</h2> | |
| <!-- Column headers carry scope="col" so assistive technology associates every data cell with its column. --> | |
| <div class="scroll"><table><thead><tr><th scope="col">Metric</th><th scope="col">v10 seed</th><th scope="col">GEPA best</th><th scope="col">Delta</th></tr></thead><tbody> | |
| <tr><td>GEPA mean score</td><td>0.7307</td><td class="good">0.7350</td><td class="good">+0.0043</td></tr> | |
| <tr><td>Micro-F1</td><td class="good">0.8231</td><td>0.8206</td><td class="bad">-0.0025</td></tr> | |
| <tr><td>Precision</td><td class="good">0.8344</td><td>0.8246</td><td class="bad">-0.0098</td></tr> | |
| <tr><td>Recall</td><td>0.8120</td><td class="good">0.8167</td><td class="good">+0.0047</td></tr> | |
| <tr><td>Exact match</td><td>0.5242</td><td class="good">0.5424</td><td class="good">+0.0182</td></tr> | |
| <tr><td>False positives</td><td class="good">102</td><td>110</td><td class="bad">+8</td></tr> | |
| <tr><td>False negatives</td><td>119</td><td class="good">116</td><td class="good">-3</td></tr> | |
| </tbody></table></div> | |
| </section> | |
| <!-- Archive: button-styled links to the 2026-06-14 12B reports. --> | |
| <section> | |
| <h2>Archive</h2> | |
| <p> | |
| <a class="button" href="final-cardinality-report.html">2026-06-14 12B cardinality report</a> | |
| <a class="button" href="gepa-12b-row30-prop20-continuation-20260614T021448Z/score_report.html">2026-06-14 12B score graph</a> | |
| <a class="button" href="prompt-diffs/index.html">2026-06-14 prompt diffs</a> | |
| </p> | |
| </section> | |
| </main> | |
| </body> | |
| </html> | |