Spaces:
Running
Running
| import time | |
| from pathlib import Path | |
| import pandas as pd | |
| import streamlit as st | |
# Browser-tab title/icon and wide layout for the whole app.
_PAGE_SETTINGS = {
    "page_title": "Firefox Query Normalizer",
    "page_icon": "π",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Directory containing this script; data files are resolved relative to it.
HERE = Path(__file__).parent
# --- Normalizer (loaded once, cached across reruns) ---------------------------
@st.cache_resource
def load_normalizer():
    """Return the shared ``CombinedV2Normalizer`` instance.

    The function is wrapped in ``st.cache_resource`` so the normalizer is
    constructed on the first call only and the same object is handed back on
    later script reruns, instead of being rebuilt every time the page
    re-executes.

    Returns:
        A ``CombinedV2Normalizer`` built with its default constructor.
    """
    # Function-scope import (hence the PLC0415 suppression): the benchmark
    # module is only imported when the normalizer is first requested.
    from benchmark import CombinedV2Normalizer  # noqa: PLC0415

    return CombinedV2Normalizer()
# --- Data ---------------------------------------------------------------------
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the benchmark results and attach a per-row outcome label.

    Reads ``results.csv`` from the directory of this script, coerces the
    ``should_change`` and ``em`` columns to ``bool`` and adds an ``outcome``
    column computed by ``_classify_outcome``.

    The function is wrapped in ``st.cache_data`` because every tab calls it on
    every rerun; without caching the CSV is re-read and re-classified three
    times per interaction.

    Returns:
        The benchmark results as a DataFrame with the extra ``outcome`` column.
    """
    df = pd.read_csv(HERE / "results.csv")
    df["should_change"] = df["should_change"].astype(bool)
    df["em"] = df["em"].astype(bool)
    # Row-wise classification: the label depends on several columns at once.
    df["outcome"] = df.apply(_classify_outcome, axis=1)
    return df
| def _classify_outcome(row) -> str: | |
| if row["should_change"]: | |
| if row["em"]: | |
| return "β Fixed correctly" | |
| elif str(row["pred"]).strip().lower() == str(row["noisy"]).strip().lower(): | |
| return "β Not fixed" | |
| else: | |
| return "β οΈ Fixed incorrectly" | |
| else: | |
| return "β Left unchanged" if row["em"] else "β Over-corrected" | |
# Display metadata for each benchmark category, keyed by the value stored in the
# "category" column: (short label shown in the UI, one-line description).
# Insertion order is the order used by the category pickers and the
# per-category tables.
CATEGORY_INFO: dict[str, tuple[str, str]] = {
    "single_typo": ("βοΈ Single Typo", "One misspelled word (e.g. 'wheather' β 'weather')"),
    "multi_typo": ("βοΈ Multi Typo", "Two or more typos in the same query"),
    "brand_typo": ("π·οΈ Brand Typo", "Brand name misspelled (e.g. 'bestbuyt' β 'best buy')"),
    "flight_order": ("βοΈ Flight Order", "Flight number tokens reordered (e.g. '163 SQ' β 'SQ163')"),
    "product_order": ("π± Product Order", "Product tokens reordered (e.g. '15 iphone' β 'iphone 15')"),
    "stock_canon": ("π Stock Ticker", "Stock query β ticker only (e.g. 'AAPL stock' β 'AAPL')"),
    "spacing": ("β΅ Spacing", "Missing spaces fixed (e.g. 'nearme' β 'near me')"),
    "no_change": ("π No Change", "Should not be modified β tests over-correction resistance"),
}

# Outcome labels in the order offered by the Browse tab's outcome filter.
# Rows are filtered with isin() against these values, so each string must match
# the corresponding label returned by _classify_outcome exactly.
OUTCOME_ORDER = [
    "β Fixed correctly",
    "β Left unchanged",
    "β Not fixed",
    "β οΈ Fixed incorrectly",
    "β Over-corrected",
]
# --- Header -------------------------------------------------------------------
# Page title and one-line description of what the app demonstrates.
st.title("π Query Normalizer")
st.caption("**CombinedV2** pipeline Β· Preprocessing stage for Merino intent classification")

# Background explainer, collapsed by default (expanded=False).
with st.expander("βΉοΈ What is this and why does it matter?", expanded=False):
    # NOTE(review): the benchmark figures in this literal are static text; they
    # are not computed from results.csv, so they can drift from the numbers
    # shown on the Performance tab.
    # NOTE(review): in this view the literal has no blank lines between
    # paragraphs and tables -- confirm the Markdown renders as intended.
    st.markdown("""
Intent detection tries to classify user queries by intents β
navigational, local, commercial, etc. β to surface the right suggestions.
Real queries are noisy: users make typos, omit spaces, or enter tokens in the
wrong order.
**CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms**
per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:
| Step | What it handles | Example |
|------|----------------|---------|
| **1 Β· Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` β `SQ163` |
| **2 Β· RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` β `best buy` |
| **3 Β· SymSpell** | Concatenated word splitting | `nearme` β `near me` |
| **4 Β· GuardedPySpell** | Spell correction (skips β€4-char tokens & ALL_CAPS) | `wheather nyc` β `weather nyc` |
**Benchmark results across 299 queries in 8 categories:**
| Metric | Score |
|--------|-------|
| Exact match on queries that need fixing | **73.2%** |
| Precision on queries that should NOT change | **98.5%** |
| Median latency (p50) | **0.03 ms** |
""")
st.divider()

# --- Tabs ---------------------------------------------------------------------
# One container per tab; each is filled by its own `with` block further down.
tab_try, tab_browse, tab_perf = st.tabs(["π€ Try It", "π Browse Examples", "π Performance"])
# ----------------------------------------------------------------------
# TAB 1 - Try It
# ----------------------------------------------------------------------
# Interactive tab: normalize a free-form query, or pick a benchmark example
# and compare the live normalizer output with the expected value.
with tab_try:
    norm = load_normalizer()
    df = load_data()

    # -- Free-form input (prominent) -----------------------------------
    st.subheader("Type a query to normalize")
    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickersβ¦")
    user_query = st.text_input(
        "Query input",
        placeholder="e.g. wheather nyc Β· 163 SQ Β· bestbuyt Β· nearme Β· 15 iphone Β· AAPL stock",
        label_visibility="collapsed",
        key="user_query",
    )

    # Strip once; whitespace-only input is treated as "nothing entered".
    query = user_query.strip()
    if query:
        result = norm.normalize(query)
        # Case-only differences are reported as "no change needed".
        if result.lower() == query.lower():
            st.success(f"**`{query}`** β no change needed β **`{result}`**")
        else:
            st.info(f"**`{query}`** β **`{result}`**")

        # Check if it's in the benchmark dataset (case-insensitive on the input).
        match = df[df["noisy"].str.lower() == query.lower()]
        if len(match):
            bench_row = match.iloc[0]
            cat_label = CATEGORY_INFO.get(bench_row["category"], (bench_row["category"], ""))[0]
            if result == bench_row["canonical"]:
                note = f"β Matches expected output `{bench_row['canonical']}`"
            else:
                note = f"Expected `{bench_row['canonical']}` Β· benchmark outcome: **{bench_row['outcome']}**"
            st.caption(f"_Found in benchmark Β· {cat_label} Β· {note}_")

    st.divider()

    # -- Example picker --------------------------------------------------
    st.subheader("Or pick an example from the benchmark")
    pick_col1, pick_col2 = st.columns(2)
    with pick_col1:
        cat_pick = st.selectbox(
            "Category",
            ["All"] + list(CATEGORY_INFO.keys()),
            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
            key="cat_pick",
        )
    with pick_col2:
        show_errors_only = st.checkbox("Errors / failures only", value=False)

    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
    if show_errors_only:
        sub = sub[~sub["em"]]

    if len(sub) == 0:
        st.info("No examples match these filters.")
    else:
        # Select by DataFrame index and only *display* the label. Parsing the
        # query back out of the label (split on " [") truncated queries that
        # themselves contain " [" and could not tell duplicate inputs apart.
        labels_by_index = {
            r.Index: f"{r.noisy} [{CATEGORY_INFO.get(r.category, (r.category, ''))[0]}]"
            for r in sub.itertuples()
        }
        picked_index = st.selectbox(
            "Example",
            options=list(labels_by_index),
            format_func=labels_by_index.__getitem__,
            key="example_pick",
        )
        row = sub.loc[picked_index]
        # str() keeps the value identical to what the label shows.
        picked_noisy = str(row["noisy"])

        ex_left, ex_right = st.columns([3, 1])
        with ex_left:
            # Time only the normalize() call itself.
            t0 = time.perf_counter()
            ex_result = norm.normalize(picked_noisy)
            elapsed_ms = (time.perf_counter() - t0) * 1000
            st.markdown(f"**Input:** `{picked_noisy}`")
            st.markdown(f"**Expected:** `{row['canonical']}`")
            if ex_result == row["canonical"]:
                st.success(f"**Got:** `{ex_result}` β ")
            elif ex_result.lower() == picked_noisy.lower():
                st.error(f"**Got:** `{ex_result}` β normalizer didn't fix it")
            else:
                st.warning(f"**Got:** `{ex_result}` β expected `{row['canonical']}`")
        with ex_right:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            st.caption(cat_label)
            # Unknown categories fall back to showing the raw category key.
            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])
# ----------------------------------------------------------------------
# TAB 2 - Browse Examples
# ----------------------------------------------------------------------
# Filterable table of every benchmark row with its predicted/expected output.
with tab_browse:
    df = load_data()

    # Two side-by-side filters; both start with everything selected.
    category_col, outcome_col = st.columns(2)
    with category_col:
        cats = st.multiselect(
            "Categories",
            options=list(CATEGORY_INFO.keys()),
            default=list(CATEGORY_INFO.keys()),
            format_func=lambda k: CATEGORY_INFO[k][0],
        )
    with outcome_col:
        outcomes = st.multiselect(
            "Outcomes",
            options=OUTCOME_ORDER,
            default=OUTCOME_ORDER,
        )

    # A row is shown only when it passes both filters.
    in_selected_categories = df["category"].isin(cats)
    in_selected_outcomes = df["outcome"].isin(outcomes)
    filtered = df[in_selected_categories & in_selected_outcomes]
    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")

    # Source column -> header shown in the table (dict order = column order).
    column_titles = {
        "noisy": "Input (noisy)",
        "pred": "Predicted",
        "canonical": "Expected",
        "category": "Category",
        "outcome": "Outcome",
    }
    display = filtered[list(column_titles)].rename(columns=column_titles)
    # Replace raw category keys with their UI labels; unknown keys pass through.
    display["Category"] = display["Category"].map(
        lambda k: CATEGORY_INFO.get(k, (k, ""))[0]
    )

    column_widths = {
        "Input (noisy)": "medium",
        "Predicted": "medium",
        "Expected": "medium",
        "Category": "medium",
        "Outcome": "small",
    }
    st.dataframe(
        display,
        use_container_width=True,
        hide_index=True,
        height=540,
        column_config={
            title: st.column_config.TextColumn(width=width)
            for title, width in column_widths.items()
        },
    )
# ----------------------------------------------------------------------
# TAB 3 - Performance
# ----------------------------------------------------------------------
# Headline metrics, a per-category breakdown and the list of failing rows.
with tab_perf:
    df = load_data()
    must_change = df["should_change"]
    needs_change = df[must_change]
    no_change = df[~must_change]

    # Headline numbers across the whole benchmark.
    metric_cols = st.columns(4)
    metric_cols[0].metric("Total examples", f"{len(df)}")
    metric_cols[1].metric("Overall EM", f"{df['em'].mean():.1%}")
    metric_cols[2].metric(
        "Fix accuracy",
        f"{needs_change['em'].mean():.1%}",
        help="Exact match on queries that SHOULD change",
    )
    metric_cols[3].metric(
        "No-change precision",
        f"{no_change['em'].mean():.1%}",
        help="Correctly left unchanged queries that should NOT change",
    )

    st.markdown("---")
    st.subheader("Per-category breakdown")

    # One summary record per category that actually occurs in the data.
    rows = []
    for cat, (label, desc) in CATEGORY_INFO.items():
        in_category = df[df["category"] == cat]
        if in_category.empty:
            continue
        to_fix = in_category[in_category["should_change"]]
        to_keep = in_category[~in_category["should_change"]]
        # A placeholder is shown when a category has no rows of that kind.
        fix_accuracy = f"{to_fix['em'].mean():.0%}" if len(to_fix) else "β"
        keep_precision = f"{to_keep['em'].mean():.0%}" if len(to_keep) else "β"
        rows.append({
            "Category": label,
            "n": len(in_category),
            "EM %": f"{in_category['em'].mean():.0%}",
            "Fix accuracy": fix_accuracy,
            "No-change prec.": keep_precision,
            "Errors": int((~in_category["em"]).sum()),
            "What it tests": desc,
        })
    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Failure cases by category")
    st.caption("All queries where the normalizer produced a wrong output.")

    failures = df[~df["em"]]
    if failures.empty:
        st.success("No failures!")
    else:
        failure_titles = {
            "noisy": "Input",
            "pred": "Predicted",
            "canonical": "Expected",
            "outcome": "Outcome",
        }
        # One expander per category, in CATEGORY_INFO order, skipping clean ones.
        for cat, (label, _) in CATEGORY_INFO.items():
            cat_failures = failures[failures["category"] == cat]
            count = len(cat_failures)
            if count == 0:
                continue
            plural = "" if count == 1 else "s"
            with st.expander(f"{label} β {count} failure{plural}"):
                show = cat_failures[list(failure_titles)].rename(columns=failure_titles)
                st.dataframe(show, use_container_width=True, hide_index=True)