# Hugging Face Space by Fourwheels2512 β€” "Zero forgetting benchmark dashboard" (commit 736d089)
"""ModelBrew AI β€” Zero Forgetting Benchmark Results Dashboard"""
import gradio as gr
import plotly.graph_objects as go
# ── Brand colors ──
BLUE = "#1F4E79"   # generation-metric traces
GREEN = "#4CAF50"  # ModelBrew / retention results
RED = "#E53935"    # naive-LoRA / failure results
GOLD = "#F9A825"   # accent color (not referenced by the charts below)
GRAY = "#757575"   # accent color (not referenced by the charts below)
# ── Data ──
# Benchmark 1 (multi-seed research): knowledge lost (%) per random seed.
# Negative ModelBrew drift means old-domain performance improved.
SEEDS = ["Seed 0", "Seed 42", "Seed 1234"]
NAIVE_FORGET = [38.1, 41.7, 49.0]
MODELBREW_DRIFT = [-0.03, -0.10, -0.37]
# Benchmark 2 (Walmart): BERTScores per domain. Retention is None for the
# first domain because there was no earlier knowledge to retain.
WALMART_DOMAINS = ["Customer Service", "Product Knowledge", "HR Policy", "Financial Analytics"]
WALMART_GEN = [0.92, 0.94, 0.88, 0.83]
WALMART_RET = [None, 0.83, 0.86, 0.82]
# Benchmark 3 (Salesforce): generation/retention BERTScores, peak gradient
# norms, and training losses per domain (same None convention as above).
# SF_LOSS is shown in the "Salesforce Details" markdown table but is not
# plotted by any chart in this file.
SF_DOMAINS = ["CRM Ops", "Sales Ops", "Reporting", "Support", "Admin & Dev"]
SF_GEN = [0.882, 0.897, 0.890, 0.885, 0.897]
SF_RET = [None, 0.889, 0.891, 0.897, 0.907]
SF_GN = [3.68, 2.15, 3.16, 2.53, 2.11]
SF_LOSS = [1.33, 1.05, 1.24, 0.96, 0.66]
# Benchmark 4 (dental stress test): peak gradient norms across 8 sequential
# domains for ModelBrew vs naive LoRA.
DENTAL_DOMAINS = [f"Domain {i+1}" for i in range(8)]
DENTAL_MB_GN = [3.8, 4.2, 5.1, 4.5, 5.5, 4.8, 6.1, 5.2]
DENTAL_NAIVE_GN = [4.8, 5.6, 6.3, 6.9, 7.2, 8.1, 8.8, 9.4]
def make_seed_chart():
    """Grouped bar chart for Benchmark 1: naive-LoRA forgetting vs ModelBrew
    drift, one bar pair per random seed."""
    forgetting_bar = go.Bar(
        name="Naive LoRA (forgetting)",
        x=SEEDS,
        y=NAIVE_FORGET,
        marker_color=RED,
        text=[f"+{pct}%" for pct in NAIVE_FORGET],
        textposition="outside",
    )
    # Bar heights use the magnitude so both traces plot upward; the labels
    # keep the sign so negative drift (improvement) stays visible.
    drift_bar = go.Bar(
        name="ModelBrew (drift)",
        x=SEEDS,
        y=[abs(pct) for pct in MODELBREW_DRIFT],
        marker_color=GREEN,
        text=[f"{pct}%" for pct in MODELBREW_DRIFT],
        textposition="outside",
    )
    fig = go.Figure(data=[forgetting_bar, drift_bar])
    fig.update_layout(
        title="Benchmark 1: Multi-Seed Research β€” 5 Domains on Mistral-7B",
        yaxis_title="Knowledge Lost (%)",
        barmode="group",
        template="plotly_white",
        height=450,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )
    return fig
def make_walmart_chart():
    """Grouped bar chart for Benchmark 2 (Walmart): generation vs retention
    BERTScore for each of the four enterprise domains."""
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Gen BERTScore",
        x=WALMART_DOMAINS, y=WALMART_GEN,
        marker_color=BLUE,
        text=[f"{v:.2f}" for v in WALMART_GEN],
        textposition="outside",
    ))
    # The first domain has no retention score (nothing learned before it),
    # encoded as None. Compare against None explicitly: the previous falsy
    # test (`v if v else 0`) would also have swallowed a legitimate 0.0.
    ret_vals = [v if v is not None else 0 for v in WALMART_RET]
    ret_text = [f"{v:.2f}" if v is not None else "β€”" for v in WALMART_RET]
    fig.add_trace(go.Bar(
        name="Retention BERTScore",
        x=WALMART_DOMAINS, y=ret_vals,
        marker_color=GREEN,
        text=ret_text,
        textposition="outside",
    ))
    fig.update_layout(
        title="Benchmark 2: Walmart Enterprise β€” 4 Domains on Mistral-7B",
        yaxis_title="BERTScore",
        yaxis_range=[0.5, 1.0],
        barmode="group",
        template="plotly_white",
        height=450,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )
    return fig
def make_salesforce_chart():
    """Line chart for Benchmark 3 (Salesforce): generation BERTScore per
    domain, plus retention BERTScore showing positive backward transfer."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        name="Gen BERTScore",
        x=SF_DOMAINS, y=SF_GEN,
        mode="lines+markers+text",
        marker=dict(size=10, color=BLUE),
        text=[f"{v:.3f}" for v in SF_GEN],
        textposition="top center",
    ))
    # Drop missing retention entries while keeping x and y paired, so the two
    # lists cannot fall out of alignment (the old code built `SF_DOMAINS[1:]`
    # and the filtered y-list independently). Compare against None explicitly:
    # a falsy filter would also drop a legitimate 0.0 score.
    ret_pairs = [(dom, val) for dom, val in zip(SF_DOMAINS, SF_RET) if val is not None]
    ret_x = [dom for dom, _ in ret_pairs]
    ret_y = [val for _, val in ret_pairs]
    fig.add_trace(go.Scatter(
        name="Retention BERTScore",
        x=ret_x, y=ret_y,
        mode="lines+markers+text",
        marker=dict(size=10, color=GREEN),
        text=[f"{v:.3f}" for v in ret_y],
        textposition="bottom center",
        line=dict(dash="dot"),
    ))
    fig.update_layout(
        title="Benchmark 3: Salesforce Enterprise β€” 5 Domains, Positive Backward Transfer",
        yaxis_title="BERTScore",
        yaxis_range=[0.85, 0.92],
        template="plotly_white",
        height=450,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )
    return fig
def make_salesforce_gn_chart():
    """Bar chart of peak gradient norms per Salesforce domain, with a dashed
    reference line marking the norm at which naive LoRA crashed."""
    norm_bars = go.Bar(
        name="Peak Gradient Norm",
        x=SF_DOMAINS,
        y=SF_GN,
        marker_color=BLUE,
        text=[f"{norm:.2f}" for norm in SF_GN],
        textposition="outside",
    )
    fig = go.Figure(data=[norm_bars])
    # Reference line: naive LoRA's gradient norm when it crashed.
    fig.add_hline(
        y=263,
        line_dash="dash",
        line_color=RED,
        annotation_text="Naive LoRA: 263 (crashed)",
        annotation_position="top left",
    )
    fig.update_layout(
        title="Salesforce β€” Gradient Stability (Naive LoRA crashed at 263)",
        yaxis_title="Peak Gradient Norm",
        template="plotly_white",
        height=400,
    )
    return fig
def make_dental_chart():
    """Line chart for Benchmark 4: peak gradient norms of naive LoRA vs
    ModelBrew across the 8 sequential dental domains."""
    # (legend label, per-domain peak gradient norms, trace color)
    series = (
        ("Naive LoRA", DENTAL_NAIVE_GN, RED),
        ("ModelBrew", DENTAL_MB_GN, GREEN),
    )
    fig = go.Figure()
    for label, norms, color in series:
        fig.add_trace(go.Scatter(
            name=label,
            x=DENTAL_DOMAINS,
            y=norms,
            mode="lines+markers",
            marker=dict(size=8, color=color),
            line=dict(width=2),
        ))
    fig.update_layout(
        title="Benchmark 4: Dental Stress Test β€” 8 Sequential Domains",
        yaxis_title="Peak Gradient Norm",
        template="plotly_white",
        height=450,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )
    return fig
def make_summary_chart():
    """Summary chart for the Overview tab: naive-LoRA forgetting (measured
    only on the Research benchmark) vs ModelBrew drift across all four
    benchmarks.

    Removes the dead locals `naive` and `ours` the original defined but
    never used, and reuses `experiments` instead of duplicating its first
    entry as a literal.
    """
    experiments = [
        "Research\n(5 domains)",
        "Walmart\n(4 domains)",
        "Salesforce\n(5 domains)",
        "Dental\n(8 domains)",
    ]
    fig = go.Figure()
    # Naive LoRA only has a measured forgetting number for the Research
    # benchmark, so its trace covers a single x category.
    fig.add_trace(go.Bar(
        name="Naive LoRA Forgetting",
        x=experiments[:1], y=[43.0],
        marker_color=RED,
        text=["+43.0%"],
        textposition="outside",
        width=0.3,
    ))
    fig.add_trace(go.Bar(
        name="ModelBrew Drift",
        x=experiments, y=[0.17, 0, 0, 0],
        marker_color=GREEN,
        text=["-0.17%", "Zero", "Zero\n(positive transfer)", "Zero"],
        textposition="outside",
        width=0.3,
    ))
    fig.update_layout(
        title="Zero Forgetting Across All 4 Benchmarks",
        yaxis_title="Knowledge Lost (%)",
        barmode="group",
        template="plotly_white",
        height=450,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )
    return fig
# Markdown body rendered on the "Overview" tab (runtime content; rendered
# verbatim by gr.Markdown, so the text itself is not reformatted here).
OVERVIEW_MD = """
# Zero Forgetting in LLM Fine-Tuning
**Every fine-tuning run destroys what the model already knew.** Train on medical, then legal β€” medical is gone.
ModelBrew is a continual learning adapter (~0.1% additional parameters) that solves catastrophic forgetting. Train one model on domain after domain β€” **it keeps everything.**
---
### 4 Benchmarks on Mistral-7B. Zero Forgetting. Every Single One.
| Benchmark | Domains | Seeds | Result |
|-----------|---------|-------|--------|
| **Research** | 5 (Medical β†’ Legal β†’ Financial β†’ Code β†’ Science) | 3 | **-0.17% drift** vs +43% naive forgetting |
| **Walmart** | 4 (Customer Service β†’ Product β†’ HR β†’ Finance) | 1 | **BERTScores 0.82–0.94** across all domains |
| **Salesforce** | 5 (CRM β†’ Sales β†’ Reporting β†’ Support β†’ Admin) | 1 | **Positive backward transfer** (0.889 β†’ 0.907) |
| **Dental** | 8 sequential domains | 2 | **Gradient norms stable**, zero explosions |
- Spectral norm locked at **1.0** across every experiment
- Naive LoRA crashed at step 43 with gradient norm **263**. Ours: peak under **6**
- No replay buffers. No EWC. No knowledge distillation. No retraining from scratch.
---
### What This Means
Right now every AI team in the world throws away learned knowledge every time they fine-tune. That's billions of dollars in wasted compute and a fundamental barrier to AI that actually builds on what it knows over time.
- A hospital trains one model across radiology, pathology, cardiology β€” it keeps learning, never forgets
- A legal AI learns new case law without losing old precedent
- Models in developing countries accumulate knowledge across languages and domains on limited hardware
---
### What's Shipped
- **Live product** β€” processing real training runs today
- **196 automated tests** β€” CI pipeline on GitHub Actions
- **US patent pending** β€” provisional filed February 2026
- **7 technical reports** β€” from 50+ failed experiments to the working method
- **Free tier** β€” try it right now, no credit card needed
Google published Nested Learning at NeurIPS 2025. Meta has Sparse Memory Finetuning. Neither is available to use. **This is.**
---
**[Try it live](https://mhc-finetune-saas-zrtokzlkbnue9zsk7jfgad.streamlit.app)** | **[API](https://fourwheels2512--crma-finetune-fastapi-app.modal.run/docs)** | **Patent Pending (US Provisional, Feb 2026)**
*Kiran Nayudu β€” ModelBrew AI β€” fourwheels2512@gmail.com*
"""
# ── Build app ──
# `theme` is a gr.Blocks constructor argument; Blocks.launch() does not
# accept it and raises a TypeError on current Gradio releases, so it is
# passed to gr.Blocks() here instead of launch() below.
with gr.Blocks(
    title="ModelBrew AI β€” Zero Forgetting Benchmarks",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:
    gr.Markdown("# ModelBrew AI β€” Zero Forgetting Benchmark Results")
    gr.Markdown("*4 independent benchmarks on Mistral-7B. Zero catastrophic forgetting across all of them.*")
    with gr.Tabs():
        with gr.Tab("Overview"):
            gr.Markdown(OVERVIEW_MD)
            # Passing the chart function (not a Figure) lets Gradio call it
            # lazily as the component's default value.
            gr.Plot(make_summary_chart)
        with gr.Tab("Research Benchmark"):
            gr.Markdown("""
### Multi-Seed Research β€” 5 Domains, 3 Seeds
Medical β†’ Legal β†’ Financial β†’ Code β†’ Science on Mistral-7B.
Repeated across 3 random seeds to confirm reproducibility.
Naive LoRA destroyed **38–49%** of prior knowledge with every new domain.
ModelBrew drifted less than **0.4%**. The negative sign means the model actually *improved* on old domains.
Naive LoRA **crashed at step 43** with gradient norm 263.
ModelBrew completed every run with peak gradient norm under 6. Spectral norm locked at 1.0.
""")
            gr.Plot(make_seed_chart)
        with gr.Tab("Walmart Enterprise"):
            gr.Markdown("""
### Walmart Enterprise β€” 4 Domains
Customer Service β†’ Product Knowledge β†’ HR Policy β†’ Financial Analytics.
One model. Four enterprise domains. All retained.
The final model answers questions across all four with **BERTScores of 0.82–0.94**.
""")
            gr.Plot(make_walmart_chart)
        with gr.Tab("Salesforce Enterprise"):
            gr.Markdown("""
### Salesforce Enterprise β€” 5 Domains, Cumulative Adapter
CRM Operations β†’ Sales Ops β†’ Reporting & Analytics β†’ Customer Support β†’ Admin & Dev.
Retention BERTScores went **UP** with each new domain β€” 0.889 β†’ 0.891 β†’ 0.897 β†’ 0.907.
The model gets *better* at old domains as it learns new ones. **Positive backward transfer.**
Peak gradient norms stayed between **2.1 and 3.7**. Zero gradient explosions.
""")
            gr.Plot(make_salesforce_chart)
            gr.Plot(make_salesforce_gn_chart)
        with gr.Tab("Dental Stress Test"):
            gr.Markdown("""
### Dental Stress Test β€” 8 Sequential Domains, 2 Seeds
The longest chain we've tested. Eight sequential domains on Mistral-7B.
Peak gradient norms stayed between **3.8 and 6.1** across all 8 domains.
Naive LoRA gradient norms grew monotonically to **9.4**.
Spectral norm: **1.0** throughout. Zero crashes. Zero NaN losses.
""")
            gr.Plot(make_dental_chart)
        with gr.Tab("Salesforce Details"):
            gr.Markdown("""
### Salesforce β€” Full Per-Domain Breakdown
| Domain | Training Loss | Gen BERTScore | Retention BERTScore | Peak Grad Norm |
|--------|:---:|:---:|:---:|:---:|
| 1. CRM Operations | 1.33 | 0.882 | β€” | 3.68 |
| 2. Sales Ops | 1.05 | 0.897 | 0.889 | 2.15 |
| 3. Reporting & Analytics | 1.24 | 0.890 | 0.891 | 3.16 |
| 4. Customer Support | 0.96 | 0.885 | 0.897 | 2.53 |
| 5. Admin & Dev | 0.66 | 0.897 | 0.907 | 2.11 |
**Key findings:**
- Retention BERTScores *improved* as domains accumulated β€” evidence of positive backward transfer
- Training loss decreased across domains (1.33 β†’ 0.66) β€” the model learns faster with more accumulated knowledge
- Peak gradient norms stayed between 2.1–3.7 β€” zero gradient explosions, zero NaN losses
- Final adapter answers questions from all 5 Salesforce domains
""")
    # Footer shown beneath the tabs on every view.
    gr.Markdown("---")
    gr.Markdown(
        "*ModelBrew AI β€” Patent Pending (US Provisional, Feb 2026) β€” "
        "[Try it live](https://mhc-finetune-saas-zrtokzlkbnue9zsk7jfgad.streamlit.app) β€” "
        "fourwheels2512@gmail.com*"
    )

demo.launch()