Spaces:
Running
Running
Emre Sarigöl committed on
Commit ·
ef90a4e
1
Parent(s): ec43ea1
Deploy GURMA.ai Dashboard - 2026-03-01 20:49
Browse files
- app.py +0 -2
- config.py +2 -6
- eval_tab.py +23 -106
- search.py +0 -9
- sota_agent.py +0 -1
app.py
CHANGED
|
@@ -30,7 +30,6 @@ if IS_HF_SPACE:
|
|
| 30 |
SearchService,
|
| 31 |
CompetitorExtractor,
|
| 32 |
CompetitorIntelAgent,
|
| 33 |
-
ResultStorage,
|
| 34 |
LLMClient,
|
| 35 |
RESEARCH_DIR,
|
| 36 |
COMPETITORS,
|
|
@@ -47,7 +46,6 @@ else:
|
|
| 47 |
SearchService,
|
| 48 |
CompetitorExtractor,
|
| 49 |
CompetitorIntelAgent,
|
| 50 |
-
ResultStorage,
|
| 51 |
LLMClient,
|
| 52 |
RESEARCH_DIR,
|
| 53 |
COMPETITORS,
|
|
|
|
| 30 |
SearchService,
|
| 31 |
CompetitorExtractor,
|
| 32 |
CompetitorIntelAgent,
|
|
|
|
| 33 |
LLMClient,
|
| 34 |
RESEARCH_DIR,
|
| 35 |
COMPETITORS,
|
|
|
|
| 46 |
SearchService,
|
| 47 |
CompetitorExtractor,
|
| 48 |
CompetitorIntelAgent,
|
|
|
|
| 49 |
LLMClient,
|
| 50 |
RESEARCH_DIR,
|
| 51 |
COMPETITORS,
|
config.py
CHANGED
|
@@ -36,12 +36,8 @@ if not IS_HF_SPACE:
|
|
| 36 |
# Directories
|
| 37 |
# ============================================================
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
DATA_DIR = PROJECT_ROOT / "data"
|
| 42 |
-
else:
|
| 43 |
-
RESEARCH_DIR = PROJECT_ROOT / "data"
|
| 44 |
-
DATA_DIR = PROJECT_ROOT / "src" / "dashboard"
|
| 45 |
|
| 46 |
RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
|
| 47 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 36 |
# Directories
|
| 37 |
# ============================================================
|
| 38 |
|
| 39 |
+
RESEARCH_DIR = PROJECT_ROOT / "data"
|
| 40 |
+
DATA_DIR = PROJECT_ROOT / "data" if IS_HF_SPACE else PROJECT_ROOT / "src" / "dashboard"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
|
| 43 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
eval_tab.py
CHANGED
|
@@ -225,28 +225,6 @@ def _recompute_specialized_aggregate(bench_data: dict) -> dict | None:
|
|
| 225 |
return patched
|
| 226 |
|
| 227 |
|
| 228 |
-
# ============================================================
|
| 229 |
-
# Metric Helpers
|
| 230 |
-
# ============================================================
|
| 231 |
-
|
| 232 |
-
def _pct(val: float | None) -> str:
|
| 233 |
-
if val is None:
|
| 234 |
-
return "n/a"
|
| 235 |
-
return f"{val:.1%}"
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
def _f4(val: float | None) -> str:
|
| 239 |
-
if val is None:
|
| 240 |
-
return "n/a"
|
| 241 |
-
return f"{val:.4f}"
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
def _delta(base: float | None, adapted: float | None) -> float | None:
|
| 245 |
-
if base is None or adapted is None:
|
| 246 |
-
return None
|
| 247 |
-
return adapted - base
|
| 248 |
-
|
| 249 |
-
|
| 250 |
# ============================================================
|
| 251 |
# Inference Backends
|
| 252 |
# ============================================================
|
|
@@ -632,56 +610,6 @@ def _render_category_chart(agg: dict, has_adapted: bool):
|
|
| 632 |
st.plotly_chart(fig, width="stretch")
|
| 633 |
|
| 634 |
|
| 635 |
-
# def _render_radar_chart(agg: dict, has_adapted: bool):
|
| 636 |
-
# """Multi-metric radar chart comparing base vs adapted."""
|
| 637 |
-
# b = agg.get("base", {})
|
| 638 |
-
# a = agg.get("adapted", {}) if has_adapted else {}
|
| 639 |
-
#
|
| 640 |
-
# dims = [
|
| 641 |
-
# ("ROUGE-1", "rouge1_f1"),
|
| 642 |
-
# ("ROUGE-L", "rougeL_f1"),
|
| 643 |
-
# ("BLEU-4", "bleu"),
|
| 644 |
-
# ("Term Recall", "clinical_term_recall"),
|
| 645 |
-
# ("Num Recall", "numeric_recall"),
|
| 646 |
-
# ("Safety", "safety_awareness_pct"),
|
| 647 |
-
# ("Structure", "structured_pct"),
|
| 648 |
-
# ]
|
| 649 |
-
#
|
| 650 |
-
# labels = [d[0] for d in dims]
|
| 651 |
-
# base_vals = [b.get(d[1]) or 0 for d in dims]
|
| 652 |
-
#
|
| 653 |
-
# fig = go.Figure()
|
| 654 |
-
#
|
| 655 |
-
# fig.add_trace(go.Scatterpolar(
|
| 656 |
-
# r=base_vals + [base_vals[0]],
|
| 657 |
-
# theta=labels + [labels[0]],
|
| 658 |
-
# fill="toself",
|
| 659 |
-
# name="Base",
|
| 660 |
-
# line_color=C_BASE,
|
| 661 |
-
# opacity=0.6,
|
| 662 |
-
# ))
|
| 663 |
-
#
|
| 664 |
-
# if has_adapted:
|
| 665 |
-
# adapted_vals = [a.get(d[1]) or 0 for d in dims]
|
| 666 |
-
# fig.add_trace(go.Scatterpolar(
|
| 667 |
-
# r=adapted_vals + [adapted_vals[0]],
|
| 668 |
-
# theta=labels + [labels[0]],
|
| 669 |
-
# fill="toself",
|
| 670 |
-
# name="Adapted",
|
| 671 |
-
# line_color=C_ADAPTED,
|
| 672 |
-
# opacity=0.6,
|
| 673 |
-
# ))
|
| 674 |
-
#
|
| 675 |
-
# fig.update_layout(
|
| 676 |
-
# title="Quality Profile",
|
| 677 |
-
# polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
|
| 678 |
-
# height=340,
|
| 679 |
-
# margin=dict(t=40, b=20, l=60, r=60),
|
| 680 |
-
# legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 681 |
-
# )
|
| 682 |
-
# st.plotly_chart(fig, width="stretch")
|
| 683 |
-
|
| 684 |
-
|
| 685 |
def _render_prediction_accuracy(agg: dict, has_adapted: bool,
|
| 686 |
examples: list[dict]):
|
| 687 |
"""Show predictive accuracy metrics for prediction_* categories.
|
|
@@ -1685,12 +1613,30 @@ def render_eval_tab():
|
|
| 1685 |
examples = bench_data.get("per_example", [])
|
| 1686 |
has_adapted = "adapted" in agg
|
| 1687 |
|
| 1688 |
-
#
|
| 1689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1690 |
if has_adapted:
|
| 1691 |
-
# Find baseline comparison sources:
|
| 1692 |
-
# 1. Base-only runs from any model with matching sample count
|
| 1693 |
-
# 2. Adapted runs from different models (for model-vs-model comparison)
|
| 1694 |
n_samples = len(examples)
|
| 1695 |
own_model = bench_data.get("model", "")
|
| 1696 |
own_key = selected
|
|
@@ -1715,7 +1661,6 @@ def render_eval_tab():
|
|
| 1715 |
ext_labels.append(f"{model}{tag} ({ts})")
|
| 1716 |
|
| 1717 |
options = [builtin_label] + ext_labels
|
| 1718 |
-
# Default to first external baseline (most recent) if available
|
| 1719 |
default_idx = 1 if len(options) > 1 else 0
|
| 1720 |
bl_sel = st.selectbox(
|
| 1721 |
"Baseline source",
|
|
@@ -1730,17 +1675,13 @@ def render_eval_tab():
|
|
| 1730 |
_, ext_data = bench_map[ext_key]
|
| 1731 |
ext_agg = ext_data.get("aggregate", {})
|
| 1732 |
ext_examples = ext_data.get("per_example", [])
|
| 1733 |
-
baseline_model_label = ext_data.get("model", "?")
|
| 1734 |
|
| 1735 |
-
# Use the external run's adapted metrics as baseline if available,
|
| 1736 |
-
# otherwise fall back to its base metrics
|
| 1737 |
agg = dict(agg)
|
| 1738 |
if "adapted" in ext_agg:
|
| 1739 |
agg["base"] = ext_agg["adapted"]
|
| 1740 |
else:
|
| 1741 |
agg["base"] = ext_agg.get("base", agg.get("base", {}))
|
| 1742 |
|
| 1743 |
-
# Swap per-example base metrics & responses
|
| 1744 |
if len(ext_examples) == len(examples):
|
| 1745 |
examples = [dict(ex) for ex in examples]
|
| 1746 |
for i, ext_ex in enumerate(ext_examples):
|
|
@@ -1749,30 +1690,6 @@ def render_eval_tab():
|
|
| 1749 |
examples[i]["base_response"] = ext_ex.get(
|
| 1750 |
"base_response", examples[i].get("base_response", ""))
|
| 1751 |
|
| 1752 |
-
# Info bar
|
| 1753 |
-
col1, col2, col3, col4 = st.columns(4)
|
| 1754 |
-
with col1:
|
| 1755 |
-
st.caption(f"**Model:** `{bench_data.get('model', '?')}`")
|
| 1756 |
-
with col2:
|
| 1757 |
-
adapter = _resolve_adapter(bench_data)
|
| 1758 |
-
adapter_label = Path(adapter).name if adapter else "none"
|
| 1759 |
-
if _is_routed(bench_data):
|
| 1760 |
-
n_routes = len(bench_data.get("routing", {}))
|
| 1761 |
-
adapter_label = f"routed ({n_routes} specialized + general)"
|
| 1762 |
-
st.caption(f"**Adapter:** `{adapter_label}`")
|
| 1763 |
-
with col3:
|
| 1764 |
-
st.caption(f"**Samples:** `{len(examples)}`")
|
| 1765 |
-
with col4:
|
| 1766 |
-
st.caption(f"**Baseline:** `{baseline_model_label}`")
|
| 1767 |
-
|
| 1768 |
-
# --- About This Model (collapsible) ---
|
| 1769 |
-
_render_model_info(bench_data, agg, has_adapted, len(examples))
|
| 1770 |
-
|
| 1771 |
-
# --- Baseline Comparison Table ---
|
| 1772 |
-
if has_adapted:
|
| 1773 |
-
_render_baseline_comparison(bench_data, bench_map, all_keys,
|
| 1774 |
-
agg_override=agg)
|
| 1775 |
-
|
| 1776 |
st.divider()
|
| 1777 |
|
| 1778 |
# --- Metric Cards ---
|
|
|
|
| 225 |
return patched
|
| 226 |
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
# ============================================================
|
| 229 |
# Inference Backends
|
| 230 |
# ============================================================
|
|
|
|
| 610 |
st.plotly_chart(fig, width="stretch")
|
| 611 |
|
| 612 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
def _render_prediction_accuracy(agg: dict, has_adapted: bool,
|
| 614 |
examples: list[dict]):
|
| 615 |
"""Show predictive accuracy metrics for prediction_* categories.
|
|
|
|
| 1613 |
examples = bench_data.get("per_example", [])
|
| 1614 |
has_adapted = "adapted" in agg
|
| 1615 |
|
| 1616 |
+
# Info bar
|
| 1617 |
+
col1, col2, col3 = st.columns(3)
|
| 1618 |
+
with col1:
|
| 1619 |
+
st.caption(f"**Model:** `{bench_data.get('model', '?')}`")
|
| 1620 |
+
with col2:
|
| 1621 |
+
adapter = _resolve_adapter(bench_data)
|
| 1622 |
+
adapter_label = Path(adapter).name if adapter else "none"
|
| 1623 |
+
if _is_routed(bench_data):
|
| 1624 |
+
n_routes = len(bench_data.get("routing", {}))
|
| 1625 |
+
adapter_label = f"routed ({n_routes} specialized + general)"
|
| 1626 |
+
st.caption(f"**Adapter:** `{adapter_label}`")
|
| 1627 |
+
with col3:
|
| 1628 |
+
st.caption(f"**Samples:** `{len(examples)}`")
|
| 1629 |
+
|
| 1630 |
+
# --- About This Model (collapsible) ---
|
| 1631 |
+
_render_model_info(bench_data, agg, has_adapted, len(examples))
|
| 1632 |
+
|
| 1633 |
+
# --- Baseline Comparison Table ---
|
| 1634 |
+
if has_adapted:
|
| 1635 |
+
_render_baseline_comparison(bench_data, bench_map, all_keys,
|
| 1636 |
+
agg_override=agg)
|
| 1637 |
+
|
| 1638 |
+
# --- Baseline source selector (swaps base metrics for sections below) ---
|
| 1639 |
if has_adapted:
|
|
|
|
|
|
|
|
|
|
| 1640 |
n_samples = len(examples)
|
| 1641 |
own_model = bench_data.get("model", "")
|
| 1642 |
own_key = selected
|
|
|
|
| 1661 |
ext_labels.append(f"{model}{tag} ({ts})")
|
| 1662 |
|
| 1663 |
options = [builtin_label] + ext_labels
|
|
|
|
| 1664 |
default_idx = 1 if len(options) > 1 else 0
|
| 1665 |
bl_sel = st.selectbox(
|
| 1666 |
"Baseline source",
|
|
|
|
| 1675 |
_, ext_data = bench_map[ext_key]
|
| 1676 |
ext_agg = ext_data.get("aggregate", {})
|
| 1677 |
ext_examples = ext_data.get("per_example", [])
|
|
|
|
| 1678 |
|
|
|
|
|
|
|
| 1679 |
agg = dict(agg)
|
| 1680 |
if "adapted" in ext_agg:
|
| 1681 |
agg["base"] = ext_agg["adapted"]
|
| 1682 |
else:
|
| 1683 |
agg["base"] = ext_agg.get("base", agg.get("base", {}))
|
| 1684 |
|
|
|
|
| 1685 |
if len(ext_examples) == len(examples):
|
| 1686 |
examples = [dict(ex) for ex in examples]
|
| 1687 |
for i, ext_ex in enumerate(ext_examples):
|
|
|
|
| 1690 |
examples[i]["base_response"] = ext_ex.get(
|
| 1691 |
"base_response", examples[i].get("base_response", ""))
|
| 1692 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1693 |
st.divider()
|
| 1694 |
|
| 1695 |
# --- Metric Cards ---
|
search.py
CHANGED
|
@@ -14,8 +14,6 @@ from abc import ABC, abstractmethod
|
|
| 14 |
from dataclasses import dataclass, asdict
|
| 15 |
from datetime import datetime, timedelta
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Protocol
|
| 18 |
-
|
| 19 |
try:
|
| 20 |
from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
|
| 21 |
except ImportError:
|
|
@@ -26,13 +24,6 @@ except ImportError:
|
|
| 26 |
# Data Types
|
| 27 |
# ============================================================
|
| 28 |
|
| 29 |
-
class SearchResult(Protocol):
|
| 30 |
-
title: str
|
| 31 |
-
url: str
|
| 32 |
-
snippet: str
|
| 33 |
-
source: str
|
| 34 |
-
|
| 35 |
-
|
| 36 |
@dataclass
|
| 37 |
class WebSearchResult:
|
| 38 |
title: str
|
|
|
|
| 14 |
from dataclasses import dataclass, asdict
|
| 15 |
from datetime import datetime, timedelta
|
| 16 |
from pathlib import Path
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
|
| 19 |
except ImportError:
|
|
|
|
| 24 |
# Data Types
|
| 25 |
# ============================================================
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
@dataclass
|
| 28 |
class WebSearchResult:
|
| 29 |
title: str
|
sota_agent.py
CHANGED
|
@@ -22,7 +22,6 @@ from __future__ import annotations
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
import re
|
| 25 |
-
from dataclasses import dataclass, field, asdict
|
| 26 |
from datetime import datetime
|
| 27 |
from pathlib import Path
|
| 28 |
from typing import Optional
|
|
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
import re
|
|
|
|
| 25 |
from datetime import datetime
|
| 26 |
from pathlib import Path
|
| 27 |
from typing import Optional
|