"""Streamlit demo for the CombinedV2 query normalizer.

The page has three tabs:

* Try It: normalize a free-form query or a benchmark example.
* Browse Examples: filter the benchmark rows by category and outcome.
* Performance: aggregate exact-match metrics and per-category failure lists.

Benchmark rows are read from ``results.csv`` next to this file.  The code
below uses the columns ``noisy``, ``pred``, ``canonical``, ``category``,
``should_change`` and ``em``.
"""

import time
from pathlib import Path

import pandas as pd
import streamlit as st

st.set_page_config(
    page_title="Firefox Query Normalizer",
    page_icon="🔍",
    layout="wide",
)

HERE = Path(__file__).parent


# ─── Normalizer (loaded once, cached across reruns) ───────────────────────────

@st.cache_resource(show_spinner="Loading normalizer…")
def load_normalizer():
    """Build the normalizer once and share the instance across reruns."""
    # Imported lazily so the import only happens when the normalizer is needed.
    from benchmark import CombinedV2Normalizer  # noqa: PLC0415

    return CombinedV2Normalizer()


# ─── Data ─────────────────────────────────────────────────────────────────────

@st.cache_data
def load_data() -> pd.DataFrame:
    """Read ``results.csv`` and add the derived ``outcome`` column."""
    df = pd.read_csv(HERE / "results.csv")
    df["should_change"] = df["should_change"].astype(bool)
    df["em"] = df["em"].astype(bool)
    df["outcome"] = df.apply(_classify_outcome, axis=1)
    return df


def _classify_outcome(row) -> str:
    """Map one benchmark row to a human-readable outcome label.

    Reads ``should_change``, ``em``, ``pred`` and ``noisy`` from *row*.  The
    returned string is always one of the entries in ``OUTCOME_ORDER``.
    """
    if not row["should_change"]:
        return "✅ Left unchanged" if row["em"] else "❌ Over-corrected"
    if row["em"]:
        return "✅ Fixed correctly"
    # Prediction equal to the input (ignoring case and surrounding
    # whitespace) means the normalizer did nothing at all.
    predicted = str(row["pred"]).strip().lower()
    original = str(row["noisy"]).strip().lower()
    if predicted == original:
        return "❌ Not fixed"
    return "⚠️ Fixed incorrectly"


# category key -> (display label, description)
CATEGORY_INFO: dict[str, tuple[str, str]] = {
    "single_typo": (
        "✏️ Single Typo",
        "One misspelled word (e.g. 'wheather' → 'weather')",
    ),
    "multi_typo": (
        "✏️ Multi Typo",
        "Two or more typos in the same query",
    ),
    "brand_typo": (
        "🏷️ Brand Typo",
        "Brand name misspelled (e.g. 'bestbuyt' → 'best buy')",
    ),
    "flight_order": (
        "✈️ Flight Order",
        "Flight number tokens reordered (e.g. '163 SQ' → 'SQ163')",
    ),
    "product_order": (
        "📱 Product Order",
        "Product tokens reordered (e.g. '15 iphone' → 'iphone 15')",
    ),
    "stock_canon": (
        "📈 Stock Ticker",
        "Stock query → ticker only (e.g. 'AAPL stock' → 'AAPL')",
    ),
    "spacing": (
        "⎵ Spacing",
        "Missing spaces fixed (e.g. 'nearme' → 'near me')",
    ),
    "no_change": (
        "🔒 No Change",
        "Should not be modified — tests over-correction resistance",
    ),
}

# Must stay in sync with the strings returned by _classify_outcome().
OUTCOME_ORDER = [
    "✅ Fixed correctly",
    "✅ Left unchanged",
    "❌ Not fixed",
    "⚠️ Fixed incorrectly",
    "❌ Over-corrected",
]

ABOUT_MARKDOWN = """
Intent detection tries to classify user queries by intents — navigational, local, commercial, etc. — to surface the right suggestions. Real queries are noisy: users make typos, omit spaces, or enter tokens in the wrong order.

**CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms** per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:

| Step | What it handles | Example |
|------|----------------|---------|
| **1 · Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` → `SQ163` |
| **2 · RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` → `best buy` |
| **3 · SymSpell** | Concatenated word splitting | `nearme` → `near me` |
| **4 · GuardedPySpell** | Spell correction (skips ≤4-char tokens & ALL_CAPS) | `wheather nyc` → `weather nyc` |

**Benchmark results across 299 queries in 8 categories:**

| Metric | Score |
|--------|-------|
| Exact match on queries that need fixing | **73.2%** |
| Precision on queries that should NOT change | **98.5%** |
| Median latency (p50) | **0.03 ms** |
"""


def _category_label(category) -> str:
    """Return the display label for *category*, or the raw key if unknown."""
    return CATEGORY_INFO.get(category, (category, ""))[0]


def _format_rate(flags: pd.Series, spec: str = ".1%") -> str:
    """Format the mean of a boolean series as a percentage.

    Returns an em dash for an empty series so the UI never shows ``nan%``.
    """
    if flags.empty:
        return "—"
    return format(flags.mean(), spec)


# ─── Header ───────────────────────────────────────────────────────────────────

st.title("🔍 Query Normalizer")
st.caption("**CombinedV2** pipeline · Preprocessing stage for Merino intent classification")

with st.expander("ℹ️ What is this and why does it matter?", expanded=False):
    st.markdown(ABOUT_MARKDOWN)

st.divider()

# ─── Tabs ─────────────────────────────────────────────────────────────────────

tab_try, tab_browse, tab_perf = st.tabs(
    ["🔤 Try It", "📋 Browse Examples", "📊 Performance"]
)

# ══════════════════════════════════════════════════════════════════════
# TAB 1 — Try It
# ══════════════════════════════════════════════════════════════════════

with tab_try:
    norm = load_normalizer()
    df = load_data()

    # ── Free-form input (prominent) ───────────────────────────────────
    st.subheader("Type a query to normalize")
    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickers…")

    user_query = st.text_input(
        "Query input",
        placeholder="e.g. wheather nyc · 163 SQ · bestbuyt · nearme · 15 iphone · AAPL stock",
        label_visibility="collapsed",
        key="user_query",
    )

    query = user_query.strip()
    if query:
        result = norm.normalize(query)
        if result.lower() == query.lower():
            st.success(f"**`{query}`** → no change needed → **`{result}`**")
        else:
            st.info(f"**`{query}`** → **`{result}`**")

        # Check if it's in the benchmark dataset
        match = df[df["noisy"].str.lower() == query.lower()]
        if not match.empty:
            row = match.iloc[0]
            cat_label = _category_label(row["category"])
            if result == row["canonical"]:
                note = f"✅ Matches expected output `{row['canonical']}`"
            else:
                note = f"Expected `{row['canonical']}` · benchmark outcome: **{row['outcome']}**"
            st.caption(f"_Found in benchmark · {cat_label} · {note}_")

    st.divider()

    # ── Example picker ────────────────────────────────────────────────
    st.subheader("Or pick an example from the benchmark")

    pick_col1, pick_col2 = st.columns(2)
    with pick_col1:
        cat_pick = st.selectbox(
            "Category",
            ["All"] + list(CATEGORY_INFO.keys()),
            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
            key="cat_pick",
        )
    with pick_col2:
        show_errors_only = st.checkbox("Errors / failures only", value=False)

    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
    if show_errors_only:
        sub = sub[~sub["em"]]

    if sub.empty:
        st.info("No examples match these filters.")
    else:
        # Keep the label -> query mapping instead of parsing the query back
        # out of the label: a query that itself contains " [" would be
        # truncated by string splitting and the row lookup would then fail.
        label_to_noisy = {
            f"{row.noisy} [{_category_label(row.category)}]": row.noisy
            for row in sub.itertuples()
        }
        picked_label = st.selectbox("Example", list(label_to_noisy), key="example_pick")
        picked_noisy = label_to_noisy[picked_label]
        row = sub[sub["noisy"] == picked_noisy].iloc[0]

        ex_left, ex_right = st.columns([3, 1])
        with ex_left:
            # Time only the normalize() call itself.
            t0 = time.perf_counter()
            ex_result = norm.normalize(picked_noisy)
            elapsed_ms = (time.perf_counter() - t0) * 1000

            st.markdown(f"**Input:** `{picked_noisy}`")
            st.markdown(f"**Expected:** `{row['canonical']}`")
            if ex_result == row["canonical"]:
                st.success(f"**Got:** `{ex_result}` ✅")
            elif ex_result.lower() == picked_noisy.lower():
                st.error(f"**Got:** `{ex_result}` — normalizer didn't fix it")
            else:
                st.warning(f"**Got:** `{ex_result}` — expected `{row['canonical']}`")

        with ex_right:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")
            cat_label = _category_label(row["category"])
            st.caption(cat_label)
            # Unknown categories fall back to showing the raw key as the description.
            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])

# ══════════════════════════════════════════════════════════════════════
# TAB 2 — Browse Examples
# ══════════════════════════════════════════════════════════════════════

with tab_browse:
    df = load_data()

    f1, f2 = st.columns(2)
    with f1:
        cats = st.multiselect(
            "Categories",
            options=list(CATEGORY_INFO.keys()),
            default=list(CATEGORY_INFO.keys()),
            format_func=lambda k: CATEGORY_INFO[k][0],
        )
    with f2:
        outcomes = st.multiselect(
            "Outcomes",
            options=OUTCOME_ORDER,
            default=OUTCOME_ORDER,
        )

    filtered = df[df["category"].isin(cats) & df["outcome"].isin(outcomes)]
    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")

    display = filtered[["noisy", "pred", "canonical", "category", "outcome"]].copy()
    display.columns = ["Input (noisy)", "Predicted", "Expected", "Category", "Outcome"]
    display["Category"] = display["Category"].map(_category_label)

    st.dataframe(
        display,
        use_container_width=True,
        hide_index=True,
        height=540,
        column_config={
            "Input (noisy)": st.column_config.TextColumn(width="medium"),
            "Predicted": st.column_config.TextColumn(width="medium"),
            "Expected": st.column_config.TextColumn(width="medium"),
            "Category": st.column_config.TextColumn(width="medium"),
            "Outcome": st.column_config.TextColumn(width="small"),
        },
    )

# ══════════════════════════════════════════════════════════════════════
# TAB 3 — Performance
# ══════════════════════════════════════════════════════════════════════

with tab_perf:
    df = load_data()

    needs_change = df[df["should_change"]]
    no_change = df[~df["should_change"]]

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total examples", f"{len(df)}")
    c2.metric("Overall EM", _format_rate(df["em"]))
    c3.metric(
        "Fix accuracy",
        _format_rate(needs_change["em"]),
        help="Exact match on queries that SHOULD change",
    )
    c4.metric(
        "No-change precision",
        _format_rate(no_change["em"]),
        help="Correctly left unchanged queries that should NOT change",
    )

    st.markdown("---")
    st.subheader("Per-category breakdown")

    rows = []
    for cat, (label, desc) in CATEGORY_INFO.items():
        sub = df[df["category"] == cat]
        if sub.empty:
            continue
        needs = sub[sub["should_change"]]
        ok = sub[~sub["should_change"]]
        rows.append({
            "Category": label,
            "n": len(sub),
            "EM %": _format_rate(sub["em"], ".0%"),
            "Fix accuracy": _format_rate(needs["em"], ".0%"),
            "No-change prec.": _format_rate(ok["em"], ".0%"),
            "Errors": int((~sub["em"]).sum()),
            "What it tests": desc,
        })

    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Failure cases by category")
    st.caption("All queries where the normalizer produced a wrong output.")

    failures = df[~df["em"]]
    if failures.empty:
        st.success("No failures!")
    else:
        for cat, (label, _) in CATEGORY_INFO.items():
            sub = failures[failures["category"] == cat]
            if sub.empty:
                continue
            with st.expander(f"{label} — {len(sub)} failure{'s' if len(sub) != 1 else ''}"):
                show = sub[["noisy", "pred", "canonical", "outcome"]].copy()
                show.columns = ["Input", "Predicted", "Expected", "Outcome"]
                st.dataframe(show, use_container_width=True, hide_index=True)