Emre Sarigöl committed on
Commit
ef90a4e
·
1 Parent(s): ec43ea1

Deploy GURMA.ai Dashboard - 2026-03-01 20:49

Browse files
Files changed (5) hide show
  1. app.py +0 -2
  2. config.py +2 -6
  3. eval_tab.py +23 -106
  4. search.py +0 -9
  5. sota_agent.py +0 -1
app.py CHANGED
@@ -30,7 +30,6 @@ if IS_HF_SPACE:
30
  SearchService,
31
  CompetitorExtractor,
32
  CompetitorIntelAgent,
33
- ResultStorage,
34
  LLMClient,
35
  RESEARCH_DIR,
36
  COMPETITORS,
@@ -47,7 +46,6 @@ else:
47
  SearchService,
48
  CompetitorExtractor,
49
  CompetitorIntelAgent,
50
- ResultStorage,
51
  LLMClient,
52
  RESEARCH_DIR,
53
  COMPETITORS,
 
30
  SearchService,
31
  CompetitorExtractor,
32
  CompetitorIntelAgent,
 
33
  LLMClient,
34
  RESEARCH_DIR,
35
  COMPETITORS,
 
46
  SearchService,
47
  CompetitorExtractor,
48
  CompetitorIntelAgent,
 
49
  LLMClient,
50
  RESEARCH_DIR,
51
  COMPETITORS,
config.py CHANGED
@@ -36,12 +36,8 @@ if not IS_HF_SPACE:
36
  # Directories
37
  # ============================================================
38
 
39
- if IS_HF_SPACE:
40
- RESEARCH_DIR = PROJECT_ROOT / "data"
41
- DATA_DIR = PROJECT_ROOT / "data"
42
- else:
43
- RESEARCH_DIR = PROJECT_ROOT / "data"
44
- DATA_DIR = PROJECT_ROOT / "src" / "dashboard"
45
 
46
  RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
47
  DATA_DIR.mkdir(parents=True, exist_ok=True)
 
36
  # Directories
37
  # ============================================================
38
 
39
+ RESEARCH_DIR = PROJECT_ROOT / "data"
40
+ DATA_DIR = PROJECT_ROOT / "data" if IS_HF_SPACE else PROJECT_ROOT / "src" / "dashboard"
 
 
 
 
41
 
42
  RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
43
  DATA_DIR.mkdir(parents=True, exist_ok=True)
eval_tab.py CHANGED
@@ -225,28 +225,6 @@ def _recompute_specialized_aggregate(bench_data: dict) -> dict | None:
225
  return patched
226
 
227
 
228
- # ============================================================
229
- # Metric Helpers
230
- # ============================================================
231
-
232
- def _pct(val: float | None) -> str:
233
- if val is None:
234
- return "n/a"
235
- return f"{val:.1%}"
236
-
237
-
238
- def _f4(val: float | None) -> str:
239
- if val is None:
240
- return "n/a"
241
- return f"{val:.4f}"
242
-
243
-
244
- def _delta(base: float | None, adapted: float | None) -> float | None:
245
- if base is None or adapted is None:
246
- return None
247
- return adapted - base
248
-
249
-
250
  # ============================================================
251
  # Inference Backends
252
  # ============================================================
@@ -632,56 +610,6 @@ def _render_category_chart(agg: dict, has_adapted: bool):
632
  st.plotly_chart(fig, width="stretch")
633
 
634
 
635
- # def _render_radar_chart(agg: dict, has_adapted: bool):
636
- # """Multi-metric radar chart comparing base vs adapted."""
637
- # b = agg.get("base", {})
638
- # a = agg.get("adapted", {}) if has_adapted else {}
639
- #
640
- # dims = [
641
- # ("ROUGE-1", "rouge1_f1"),
642
- # ("ROUGE-L", "rougeL_f1"),
643
- # ("BLEU-4", "bleu"),
644
- # ("Term Recall", "clinical_term_recall"),
645
- # ("Num Recall", "numeric_recall"),
646
- # ("Safety", "safety_awareness_pct"),
647
- # ("Structure", "structured_pct"),
648
- # ]
649
- #
650
- # labels = [d[0] for d in dims]
651
- # base_vals = [b.get(d[1]) or 0 for d in dims]
652
- #
653
- # fig = go.Figure()
654
- #
655
- # fig.add_trace(go.Scatterpolar(
656
- # r=base_vals + [base_vals[0]],
657
- # theta=labels + [labels[0]],
658
- # fill="toself",
659
- # name="Base",
660
- # line_color=C_BASE,
661
- # opacity=0.6,
662
- # ))
663
- #
664
- # if has_adapted:
665
- # adapted_vals = [a.get(d[1]) or 0 for d in dims]
666
- # fig.add_trace(go.Scatterpolar(
667
- # r=adapted_vals + [adapted_vals[0]],
668
- # theta=labels + [labels[0]],
669
- # fill="toself",
670
- # name="Adapted",
671
- # line_color=C_ADAPTED,
672
- # opacity=0.6,
673
- # ))
674
- #
675
- # fig.update_layout(
676
- # title="Quality Profile",
677
- # polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
678
- # height=340,
679
- # margin=dict(t=40, b=20, l=60, r=60),
680
- # legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
681
- # )
682
- # st.plotly_chart(fig, width="stretch")
683
-
684
-
685
  def _render_prediction_accuracy(agg: dict, has_adapted: bool,
686
  examples: list[dict]):
687
  """Show predictive accuracy metrics for prediction_* categories.
@@ -1685,12 +1613,30 @@ def render_eval_tab():
1685
  examples = bench_data.get("per_example", [])
1686
  has_adapted = "adapted" in agg
1687
 
1688
- # --- Baseline source selector (when viewing adapted runs) ---
1689
- baseline_model_label = bench_data.get("model", "?")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1690
  if has_adapted:
1691
- # Find baseline comparison sources:
1692
- # 1. Base-only runs from any model with matching sample count
1693
- # 2. Adapted runs from different models (for model-vs-model comparison)
1694
  n_samples = len(examples)
1695
  own_model = bench_data.get("model", "")
1696
  own_key = selected
@@ -1715,7 +1661,6 @@ def render_eval_tab():
1715
  ext_labels.append(f"{model}{tag} ({ts})")
1716
 
1717
  options = [builtin_label] + ext_labels
1718
- # Default to first external baseline (most recent) if available
1719
  default_idx = 1 if len(options) > 1 else 0
1720
  bl_sel = st.selectbox(
1721
  "Baseline source",
@@ -1730,17 +1675,13 @@ def render_eval_tab():
1730
  _, ext_data = bench_map[ext_key]
1731
  ext_agg = ext_data.get("aggregate", {})
1732
  ext_examples = ext_data.get("per_example", [])
1733
- baseline_model_label = ext_data.get("model", "?")
1734
 
1735
- # Use the external run's adapted metrics as baseline if available,
1736
- # otherwise fall back to its base metrics
1737
  agg = dict(agg)
1738
  if "adapted" in ext_agg:
1739
  agg["base"] = ext_agg["adapted"]
1740
  else:
1741
  agg["base"] = ext_agg.get("base", agg.get("base", {}))
1742
 
1743
- # Swap per-example base metrics & responses
1744
  if len(ext_examples) == len(examples):
1745
  examples = [dict(ex) for ex in examples]
1746
  for i, ext_ex in enumerate(ext_examples):
@@ -1749,30 +1690,6 @@ def render_eval_tab():
1749
  examples[i]["base_response"] = ext_ex.get(
1750
  "base_response", examples[i].get("base_response", ""))
1751
 
1752
- # Info bar
1753
- col1, col2, col3, col4 = st.columns(4)
1754
- with col1:
1755
- st.caption(f"**Model:** `{bench_data.get('model', '?')}`")
1756
- with col2:
1757
- adapter = _resolve_adapter(bench_data)
1758
- adapter_label = Path(adapter).name if adapter else "none"
1759
- if _is_routed(bench_data):
1760
- n_routes = len(bench_data.get("routing", {}))
1761
- adapter_label = f"routed ({n_routes} specialized + general)"
1762
- st.caption(f"**Adapter:** `{adapter_label}`")
1763
- with col3:
1764
- st.caption(f"**Samples:** `{len(examples)}`")
1765
- with col4:
1766
- st.caption(f"**Baseline:** `{baseline_model_label}`")
1767
-
1768
- # --- About This Model (collapsible) ---
1769
- _render_model_info(bench_data, agg, has_adapted, len(examples))
1770
-
1771
- # --- Baseline Comparison Table ---
1772
- if has_adapted:
1773
- _render_baseline_comparison(bench_data, bench_map, all_keys,
1774
- agg_override=agg)
1775
-
1776
  st.divider()
1777
 
1778
  # --- Metric Cards ---
 
225
  return patched
226
 
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  # ============================================================
229
  # Inference Backends
230
  # ============================================================
 
610
  st.plotly_chart(fig, width="stretch")
611
 
612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  def _render_prediction_accuracy(agg: dict, has_adapted: bool,
614
  examples: list[dict]):
615
  """Show predictive accuracy metrics for prediction_* categories.
 
1613
  examples = bench_data.get("per_example", [])
1614
  has_adapted = "adapted" in agg
1615
 
1616
+ # Info bar
1617
+ col1, col2, col3 = st.columns(3)
1618
+ with col1:
1619
+ st.caption(f"**Model:** `{bench_data.get('model', '?')}`")
1620
+ with col2:
1621
+ adapter = _resolve_adapter(bench_data)
1622
+ adapter_label = Path(adapter).name if adapter else "none"
1623
+ if _is_routed(bench_data):
1624
+ n_routes = len(bench_data.get("routing", {}))
1625
+ adapter_label = f"routed ({n_routes} specialized + general)"
1626
+ st.caption(f"**Adapter:** `{adapter_label}`")
1627
+ with col3:
1628
+ st.caption(f"**Samples:** `{len(examples)}`")
1629
+
1630
+ # --- About This Model (collapsible) ---
1631
+ _render_model_info(bench_data, agg, has_adapted, len(examples))
1632
+
1633
+ # --- Baseline Comparison Table ---
1634
+ if has_adapted:
1635
+ _render_baseline_comparison(bench_data, bench_map, all_keys,
1636
+ agg_override=agg)
1637
+
1638
+ # --- Baseline source selector (swaps base metrics for sections below) ---
1639
  if has_adapted:
 
 
 
1640
  n_samples = len(examples)
1641
  own_model = bench_data.get("model", "")
1642
  own_key = selected
 
1661
  ext_labels.append(f"{model}{tag} ({ts})")
1662
 
1663
  options = [builtin_label] + ext_labels
 
1664
  default_idx = 1 if len(options) > 1 else 0
1665
  bl_sel = st.selectbox(
1666
  "Baseline source",
 
1675
  _, ext_data = bench_map[ext_key]
1676
  ext_agg = ext_data.get("aggregate", {})
1677
  ext_examples = ext_data.get("per_example", [])
 
1678
 
 
 
1679
  agg = dict(agg)
1680
  if "adapted" in ext_agg:
1681
  agg["base"] = ext_agg["adapted"]
1682
  else:
1683
  agg["base"] = ext_agg.get("base", agg.get("base", {}))
1684
 
 
1685
  if len(ext_examples) == len(examples):
1686
  examples = [dict(ex) for ex in examples]
1687
  for i, ext_ex in enumerate(ext_examples):
 
1690
  examples[i]["base_response"] = ext_ex.get(
1691
  "base_response", examples[i].get("base_response", ""))
1692
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1693
  st.divider()
1694
 
1695
  # --- Metric Cards ---
search.py CHANGED
@@ -14,8 +14,6 @@ from abc import ABC, abstractmethod
14
  from dataclasses import dataclass, asdict
15
  from datetime import datetime, timedelta
16
  from pathlib import Path
17
- from typing import Protocol
18
-
19
  try:
20
  from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
21
  except ImportError:
@@ -26,13 +24,6 @@ except ImportError:
26
  # Data Types
27
  # ============================================================
28
 
29
- class SearchResult(Protocol):
30
- title: str
31
- url: str
32
- snippet: str
33
- source: str
34
-
35
-
36
  @dataclass
37
  class WebSearchResult:
38
  title: str
 
14
  from dataclasses import dataclass, asdict
15
  from datetime import datetime, timedelta
16
  from pathlib import Path
 
 
17
  try:
18
  from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
19
  except ImportError:
 
24
  # Data Types
25
  # ============================================================
26
 
 
 
 
 
 
 
 
27
  @dataclass
28
  class WebSearchResult:
29
  title: str
sota_agent.py CHANGED
@@ -22,7 +22,6 @@ from __future__ import annotations
22
 
23
  import json
24
  import re
25
- from dataclasses import dataclass, field, asdict
26
  from datetime import datetime
27
  from pathlib import Path
28
  from typing import Optional
 
22
 
23
  import json
24
  import re
 
25
  from datetime import datetime
26
  from pathlib import Path
27
  from typing import Optional