# query_norm / app.py — Streamlit demo for the CombinedV2 query normalizer
# (provenance: uploaded by vazish, commit cce5dde, "update latency")
import time
from pathlib import Path
import pandas as pd
import streamlit as st
# Configure the Streamlit page; must be the first st.* call in the script.
st.set_page_config(
    page_title="Firefox Query Normalizer",
    page_icon="πŸ”",
    layout="wide",
)

# Directory containing this script — used to resolve the bundled results.csv.
HERE = Path(__file__).parent
# ─── Normalizer (loaded once, cached across reruns) ───────────────────────────
@st.cache_resource(show_spinner="Loading normalizer…")
def load_normalizer():
    """Construct the CombinedV2 normalizer once; cached across Streamlit reruns."""
    # Local import keeps app startup fast and avoids a module-level dependency
    # on the benchmark package (hence the PLC0415 suppression).
    from benchmark import CombinedV2Normalizer as _Normalizer  # noqa: PLC0415

    normalizer = _Normalizer()
    return normalizer
# ─── Data ─────────────────────────────────────────────────────────────────────
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the benchmark results CSV and derive typed/outcome columns.

    Returns a DataFrame with ``should_change`` and ``em`` coerced to bool
    and a human-readable ``outcome`` column computed per row.
    """
    frame = pd.read_csv(HERE / "results.csv")
    for flag_col in ("should_change", "em"):
        frame[flag_col] = frame[flag_col].astype(bool)
    frame["outcome"] = frame.apply(_classify_outcome, axis=1)
    return frame
def _classify_outcome(row) -> str:
if row["should_change"]:
if row["em"]:
return "βœ… Fixed correctly"
elif str(row["pred"]).strip().lower() == str(row["noisy"]).strip().lower():
return "❌ Not fixed"
else:
return "⚠️ Fixed incorrectly"
else:
return "βœ… Left unchanged" if row["em"] else "❌ Over-corrected"
# Maps each benchmark category id to (display label, short description).
# Labels are used in select boxes and tables; descriptions explain what
# each category stresses in the normalizer.
CATEGORY_INFO: dict[str, tuple[str, str]] = {
    "single_typo": ("✏️ Single Typo", "One misspelled word (e.g. 'wheather' β†’ 'weather')"),
    "multi_typo": ("✏️ Multi Typo", "Two or more typos in the same query"),
    "brand_typo": ("🏷️ Brand Typo", "Brand name misspelled (e.g. 'bestbuyt' β†’ 'best buy')"),
    "flight_order": ("✈️ Flight Order", "Flight number tokens reordered (e.g. '163 SQ' β†’ 'SQ163')"),
    "product_order": ("πŸ“± Product Order", "Product tokens reordered (e.g. '15 iphone' β†’ 'iphone 15')"),
    "stock_canon": ("πŸ“ˆ Stock Ticker", "Stock query β†’ ticker only (e.g. 'AAPL stock' β†’ 'AAPL')"),
    "spacing": ("⎡ Spacing", "Missing spaces fixed (e.g. 'nearme' β†’ 'near me')"),
    "no_change": ("πŸ”’ No Change", "Should not be modified β€” tests over-correction resistance"),
}

# Canonical display order for outcome labels; must match the strings
# produced by _classify_outcome (used as multiselect options in Tab 2).
OUTCOME_ORDER = [
    "βœ… Fixed correctly",
    "βœ… Left unchanged",
    "❌ Not fixed",
    "⚠️ Fixed incorrectly",
    "❌ Over-corrected",
]
# ─── Header ───────────────────────────────────────────────────────────────────
st.title("πŸ” Query Normalizer")
st.caption("**CombinedV2** pipeline Β· Preprocessing stage for Merino intent classification")

# Collapsible explainer: describes the 4-step pipeline and headline numbers.
with st.expander("ℹ️ What is this and why does it matter?", expanded=False):
    st.markdown("""
Intent detection tries to classify user queries by intents β€”
navigational, local, commercial, etc. β€” to surface the right suggestions.
Real queries are noisy: users make typos, omit spaces, or enter tokens in the
wrong order.
**CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms**
per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:
| Step | What it handles | Example |
|------|----------------|---------|
| **1 Β· Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` β†’ `SQ163` |
| **2 Β· RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` β†’ `best buy` |
| **3 Β· SymSpell** | Concatenated word splitting | `nearme` β†’ `near me` |
| **4 Β· GuardedPySpell** | Spell correction (skips ≀4-char tokens & ALL_CAPS) | `wheather nyc` β†’ `weather nyc` |
**Benchmark results across 299 queries in 8 categories:**
| Metric | Score |
|--------|-------|
| Exact match on queries that need fixing | **73.2%** |
| Precision on queries that should NOT change | **98.5%** |
| Median latency (p50) | **0.03 ms** |
""")

st.divider()

# ─── Tabs ─────────────────────────────────────────────────────────────────────
tab_try, tab_browse, tab_perf = st.tabs(["πŸ”€ Try It", "πŸ“‹ Browse Examples", "πŸ“Š Performance"])
# ══════════════════════════════════════════════════════════════════════
# TAB 1 β€” Try It
# ══════════════════════════════════════════════════════════════════════
with tab_try:
    norm = load_normalizer()
    df = load_data()

    # ── Free-form input (prominent) ───────────────────────────────────
    st.subheader("Type a query to normalize")
    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickers…")
    user_query = st.text_input(
        "Query input",
        placeholder="e.g. wheather nyc Β· 163 SQ Β· bestbuyt Β· nearme Β· 15 iphone Β· AAPL stock",
        label_visibility="collapsed",
        key="user_query",
    )
    if user_query.strip():
        result = norm.normalize(user_query.strip())
        # Case-insensitive compare: a pure case change counts as "no change".
        if result.lower() == user_query.strip().lower():
            st.success(f"**`{user_query.strip()}`** β†’ no change needed β†’ **`{result}`**")
        else:
            st.info(f"**`{user_query.strip()}`** β†’ **`{result}`**")
        # Check if it's in the benchmark dataset
        match = df[df["noisy"].str.lower() == user_query.strip().lower()]
        if len(match):
            row = match.iloc[0]
            # Fall back to the raw category id when it has no pretty label.
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            if result == row["canonical"]:
                note = f"βœ… Matches expected output `{row['canonical']}`"
            else:
                note = f"Expected `{row['canonical']}` Β· benchmark outcome: **{row['outcome']}**"
            st.caption(f"_Found in benchmark Β· {cat_label} Β· {note}_")
    st.divider()

    # ── Example picker ────────────────────────────────────────────────
    st.subheader("Or pick an example from the benchmark")
    pick_col1, pick_col2 = st.columns(2)
    with pick_col1:
        cat_pick = st.selectbox(
            "Category",
            ["All"] + list(CATEGORY_INFO.keys()),
            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
            key="cat_pick",
        )
    with pick_col2:
        show_errors_only = st.checkbox("Errors / failures only", value=False)
    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
    if show_errors_only:
        # em == False means the prediction did not exactly match the expected output.
        sub = sub[~sub["em"]]
    if len(sub) == 0:
        st.info("No examples match these filters.")
    else:
        # Label each option as "<noisy query> [<category label>]".
        example_labels = [
            f"{row.noisy} [{CATEGORY_INFO.get(row.category, (row.category,''))[0]}]"
            for row in sub.itertuples()
        ]
        picked_label = st.selectbox("Example", example_labels, key="example_pick")
        # NOTE(review): splits on the FIRST " [" to recover the query, so a
        # noisy query containing " [" would be truncated — presumably none
        # exist in results.csv; confirm, or use rsplit(" [", 1).
        picked_noisy = picked_label.split(" [")[0]
        row = sub[sub["noisy"] == picked_noisy].iloc[0]
        ex_left, ex_right = st.columns([3, 1])
        with ex_left:
            # Time a single normalize() call for the latency metric on the right.
            t0 = time.perf_counter()
            ex_result = norm.normalize(picked_noisy)
            elapsed_ms = (time.perf_counter() - t0) * 1000
            st.markdown(f"**Input:** `{picked_noisy}`")
            st.markdown(f"**Expected:** `{row['canonical']}`")
            if ex_result == row["canonical"]:
                st.success(f"**Got:** `{ex_result}` βœ…")
            elif ex_result.lower() == picked_noisy.lower():
                st.error(f"**Got:** `{ex_result}` β€” normalizer didn't fix it")
            else:
                st.warning(f"**Got:** `{ex_result}` β€” expected `{row['canonical']}`")
        with ex_right:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            st.caption(cat_label)
            # Second tuple element is the category description.
            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])
# ══════════════════════════════════════════════════════════════════════
# TAB 2 β€” Browse Examples
# ══════════════════════════════════════════════════════════════════════
with tab_browse:
    df = load_data()

    # Filters: category multiselect (pretty labels) and outcome multiselect.
    f1, f2 = st.columns(2)
    with f1:
        cats = st.multiselect(
            "Categories",
            options=list(CATEGORY_INFO.keys()),
            default=list(CATEGORY_INFO.keys()),
            format_func=lambda k: CATEGORY_INFO[k][0],
        )
    with f2:
        outcomes = st.multiselect(
            "Outcomes",
            options=OUTCOME_ORDER,
            default=OUTCOME_ORDER,
        )
    filtered = df[df["category"].isin(cats) & df["outcome"].isin(outcomes)]
    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")

    # Render the filtered rows with human-friendly column headers and
    # category ids mapped to their display labels.
    display = filtered[["noisy", "pred", "canonical", "category", "outcome"]].copy()
    display.columns = ["Input (noisy)", "Predicted", "Expected", "Category", "Outcome"]
    display["Category"] = display["Category"].map(
        lambda k: CATEGORY_INFO.get(k, (k, ""))[0]
    )
    st.dataframe(
        display,
        use_container_width=True,
        hide_index=True,
        height=540,
        column_config={
            "Input (noisy)": st.column_config.TextColumn(width="medium"),
            "Predicted": st.column_config.TextColumn(width="medium"),
            "Expected": st.column_config.TextColumn(width="medium"),
            "Category": st.column_config.TextColumn(width="medium"),
            "Outcome": st.column_config.TextColumn(width="small"),
        },
    )
# ══════════════════════════════════════════════════════════════════════
# TAB 3 β€” Performance
# ══════════════════════════════════════════════════════════════════════
with tab_perf:
    df = load_data()
    # Split by ground truth: rows that should vs. should not be modified.
    needs_change = df[df["should_change"]]
    no_change = df[~df["should_change"]]

    # Headline metrics: em is a boolean column, so .mean() is an accuracy rate.
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total examples", f"{len(df)}")
    c2.metric("Overall EM", f"{df['em'].mean():.1%}")
    c3.metric("Fix accuracy", f"{needs_change['em'].mean():.1%}",
              help="Exact match on queries that SHOULD change")
    c4.metric("No-change precision", f"{no_change['em'].mean():.1%}",
              help="Correctly left unchanged queries that should NOT change")
    st.markdown("---")

    st.subheader("Per-category breakdown")
    # One summary row per category present in the data.
    rows = []
    for cat, (label, desc) in CATEGORY_INFO.items():
        sub = df[df["category"] == cat]
        if len(sub) == 0:
            continue
        needs = sub[sub["should_change"]]
        ok = sub[~sub["should_change"]]
        rows.append({
            "Category": label,
            "n": len(sub),
            "EM %": f"{sub['em'].mean():.0%}",
            # "β€”" when the category has no rows of that kind, to avoid NaN%.
            "Fix accuracy": f"{needs['em'].mean():.0%}" if len(needs) else "β€”",
            "No-change prec.": f"{ok['em'].mean():.0%}" if len(ok) else "β€”",
            "Errors": int((~sub["em"]).sum()),
            "What it tests": desc,
        })
    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
    st.markdown("---")

    st.subheader("Failure cases by category")
    st.caption("All queries where the normalizer produced a wrong output.")
    failures = df[~df["em"]]
    if len(failures) == 0:
        st.success("No failures!")
    else:
        # One expander per category that has at least one failing row.
        for cat, (label, _) in CATEGORY_INFO.items():
            sub = failures[failures["category"] == cat]
            if len(sub) == 0:
                continue
            with st.expander(f"{label} β€” {len(sub)} failure{'s' if len(sub) != 1 else ''}"):
                show = sub[["noisy", "pred", "canonical", "outcome"]].copy()
                show.columns = ["Input", "Predicted", "Expected", "Outcome"]
                st.dataframe(show, use_container_width=True, hide_index=True)