vazish committed on
Commit
7b10861
·
unverified ·
1 Parent(s): 540ec31

query normalization

Browse files
Files changed (5) hide show
  1. app.py +305 -0
  2. benchmark.py +820 -0
  3. dataset.csv +300 -0
  4. requirements.txt +5 -2
  5. results.csv +300 -0
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import streamlit as st
6
+
7
# Page-level settings for the Streamlit app.
_PAGE_SETTINGS = {
    "page_title": "Firefox Query Normalizer",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Directory containing this script; results.csv is read relative to it.
HERE = Path(__file__).parent
14
+
15
+
16
+ # ─── Normalizer (loaded once, cached across reruns) ───────────────────────────
17
+
18
@st.cache_resource(show_spinner="Loading normalizer…")
def load_normalizer():
    """Construct the CombinedV2 normalizer used throughout the app.

    Decorated with ``st.cache_resource`` so the instance is built once and
    shared across script reruns. The ``benchmark`` module is imported at
    call time rather than at module import.
    """
    import benchmark  # noqa: PLC0415

    return benchmark.CombinedV2Normalizer()
22
+
23
+
24
+ # ─── Data ─────────────────────────────────────────────────────────────────────
25
+
26
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the benchmark results and derive the per-row outcome label.

    Reads ``results.csv`` from the script directory, coerces the
    ``should_change`` and ``em`` columns to ``bool`` and adds an ``outcome``
    column computed by ``_classify_outcome``.

    Returns:
        The results frame with the extra ``outcome`` column.
    """
    frame = pd.read_csv(HERE / "results.csv")
    # NOTE(review): astype(bool) treats any non-empty string / NaN as True;
    # presumably the CSV stores real booleans — confirm.
    for flag_column in ("should_change", "em"):
        frame[flag_column] = frame[flag_column].astype(bool)
    frame["outcome"] = frame.apply(_classify_outcome, axis=1)
    return frame
33
+
34
+
35
+ def _classify_outcome(row) -> str:
36
+ if row["should_change"]:
37
+ if row["em"]:
38
+ return "✅ Fixed correctly"
39
+ elif str(row["pred"]).strip().lower() == str(row["noisy"]).strip().lower():
40
+ return "❌ Not fixed"
41
+ else:
42
+ return "⚠️ Fixed incorrectly"
43
+ else:
44
+ return "✅ Left unchanged" if row["em"] else "❌ Over-corrected"
45
+
46
+
47
# (category key, display label, one-line description) in display order.
_CATEGORY_ROWS = (
    ("single_typo", "✏️ Single Typo", "One misspelled word (e.g. 'wheather' → 'weather')"),
    ("multi_typo", "✏️ Multi Typo", "Two or more typos in the same query"),
    ("brand_typo", "🏷️ Brand Typo", "Brand name misspelled (e.g. 'bestbuyt' → 'best buy')"),
    ("flight_order", "✈️ Flight Order", "Flight number tokens reordered (e.g. '163 SQ' → 'SQ163')"),
    ("product_order", "📱 Product Order", "Product tokens reordered (e.g. '15 iphone' → 'iphone 15')"),
    ("stock_canon", "📈 Stock Ticker", "Stock query → ticker only (e.g. 'AAPL stock' → 'AAPL')"),
    ("spacing", "⎵ Spacing", "Missing spaces fixed (e.g. 'nearme' → 'near me')"),
    ("no_change", "🔒 No Change", "Should not be modified — tests over-correction resistance"),
)

# Category key → (display label, description).
CATEGORY_INFO: dict[str, tuple[str, str]] = {
    key: (label, blurb) for key, label, blurb in _CATEGORY_ROWS
}

# Outcome labels in the order they are offered in the outcome filter.
OUTCOME_ORDER = [
    "✅ Fixed correctly",
    "✅ Left unchanged",
    "❌ Not fixed",
    "⚠️ Fixed incorrectly",
    "❌ Over-corrected",
]
65
+
66
+
67
+ # ─── Header ───────────────────────────────────────────────────────────────────
68
+
69
st.title("🔍 Query Normalizer")
st.caption("**CombinedV2** pipeline · Preprocessing stage for Merino intent classification")

# Explanatory text shown inside the collapsed "what is this" expander.
_ABOUT_MARKDOWN = """
Intent detection tries to classify user queries by intents —
navigational, local, commercial, etc. — to surface the right suggestions.
Real queries are noisy: users make typos, omit spaces, or enter tokens in the
wrong order.

**CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms**
per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:

| Step | What it handles | Example |
|------|----------------|---------|
| **1 · Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` → `SQ163` |
| **2 · RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` → `best buy` |
| **3 · SymSpell** | Concatenated word splitting | `nearme` → `near me` |
| **4 · GuardedPySpell** | Spell correction (skips ≤4-char tokens & ALL_CAPS) | `wheather nyc` → `weather nyc` |

**Benchmark results across 299 queries in 8 categories:**

| Metric | Score |
|--------|-------|
| Exact match on queries that need fixing | **73.2%** |
| Precision on queries that should NOT change | **98.5%** |
| Median latency (p50) | **0.03 ms** |
"""

with st.expander("ℹ️ What is this and why does it matter?", expanded=False):
    st.markdown(_ABOUT_MARKDOWN)

st.divider()

# ─── Tabs ─────────────────────────────────────────────────────────────────────

_TAB_LABELS = ["🔤 Try It", "📋 Browse Examples", "📊 Performance"]
tab_try, tab_browse, tab_perf = st.tabs(_TAB_LABELS)
103
+
104
+
105
+ # ══════════════════════════════════════════════════════════════════════
106
+ # TAB 1 — Try It
107
+ # ══════════════════════════════════════════════════════════════════════
108
+
109
with tab_try:
    norm = load_normalizer()
    df = load_data()

    def _same_text(left, right) -> bool:
        """Compare two values as text, ignoring case and surrounding whitespace.

        The benchmark's exact-match metric is case-insensitive, so the live
        "matches expected" checks below use the same rule; otherwise a row the
        benchmark counts as correct could be shown as a mismatch.
        """
        return str(left).strip().lower() == str(right).strip().lower()

    # ── Free-form input (prominent) ───────────────────────────────────
    st.subheader("Type a query to normalize")
    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickers…")

    user_query = st.text_input(
        "Query input",
        placeholder="e.g. wheather nyc · 163 SQ · bestbuyt · nearme · 15 iphone · AAPL stock",
        label_visibility="collapsed",
        key="user_query",
    )

    # Strip once and reuse; an all-whitespace input counts as empty.
    query_text = user_query.strip()
    if query_text:
        t0 = time.perf_counter()
        result = norm.normalize(query_text)
        elapsed_ms = (time.perf_counter() - t0) * 1000

        res_col, meta_col = st.columns([3, 1])
        with res_col:
            if result.lower() == query_text.lower():
                st.success(f"**`{query_text}`** → no change needed → **`{result}`**")
            else:
                st.info(f"**`{query_text}`** → **`{result}`**")

        with meta_col:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")

        # Check if it's in the benchmark dataset
        match = df[df["noisy"].str.lower() == query_text.lower()]
        if len(match):
            row = match.iloc[0]
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            if _same_text(result, row["canonical"]):
                note = f"✅ Matches expected output `{row['canonical']}`"
            else:
                note = f"Expected `{row['canonical']}` · benchmark outcome: **{row['outcome']}**"
            st.caption(f"_Found in benchmark · {cat_label} · {note}_")

    st.divider()

    # ── Example picker ────────────────────────────────────────────────
    st.subheader("Or pick an example from the benchmark")

    pick_col1, pick_col2 = st.columns(2)
    with pick_col1:
        cat_pick = st.selectbox(
            "Category",
            ["All"] + list(CATEGORY_INFO.keys()),
            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
            key="cat_pick",
        )
    with pick_col2:
        show_errors_only = st.checkbox("Errors / failures only", value=False)

    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
    if show_errors_only:
        sub = sub[~sub["em"]]

    if len(sub) == 0:
        st.info("No examples match these filters.")
    else:
        example_labels = [
            f"{ex.noisy} [{CATEGORY_INFO.get(ex.category, (ex.category, ''))[0]}]"
            for ex in sub.itertuples()
        ]
        # Select by row position and only *display* the label. Parsing the
        # query back out of the label text picked the wrong row when a query
        # contained " [" or when the same noisy string appeared twice.
        picked_pos = st.selectbox(
            "Example",
            list(range(len(example_labels))),
            format_func=lambda i: example_labels[i],
            key="example_pick",
        )
        row = sub.iloc[picked_pos]
        picked_noisy = str(row["noisy"])

        ex_left, ex_right = st.columns([3, 1])
        with ex_left:
            t0 = time.perf_counter()
            ex_result = norm.normalize(picked_noisy)
            elapsed_ms = (time.perf_counter() - t0) * 1000

            st.markdown(f"**Input:** `{picked_noisy}`")
            st.markdown(f"**Expected:** `{row['canonical']}`")

            if _same_text(ex_result, row["canonical"]):
                st.success(f"**Got:** `{ex_result}` ✅")
            elif ex_result.lower() == picked_noisy.lower():
                st.error(f"**Got:** `{ex_result}` — normalizer didn't fix it")
            else:
                st.warning(f"**Got:** `{ex_result}` — expected `{row['canonical']}`")

        with ex_right:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            st.caption(cat_label)
            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])
202
+
203
+
204
+ # ══════════════════════════════════════════════════════════════════════
205
+ # TAB 2 — Browse Examples
206
+ # ══════════════════════════════════════════════════════════════════════
207
+
208
with tab_browse:
    df = load_data()

    # Two side-by-side filters: category on the left, outcome on the right.
    category_col, outcome_col = st.columns(2)
    with category_col:
        cats = st.multiselect(
            "Categories",
            options=list(CATEGORY_INFO),
            default=list(CATEGORY_INFO),
            format_func=lambda k: CATEGORY_INFO[k][0],
        )
    with outcome_col:
        outcomes = st.multiselect(
            "Outcomes",
            options=OUTCOME_ORDER,
            default=OUTCOME_ORDER,
        )

    keep_mask = df["category"].isin(cats) & df["outcome"].isin(outcomes)
    filtered = df[keep_mask]
    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")

    # Project to the displayed columns and give them reader-friendly headers.
    display = filtered[["noisy", "pred", "canonical", "category", "outcome"]].rename(
        columns={
            "noisy": "Input (noisy)",
            "pred": "Predicted",
            "canonical": "Expected",
            "category": "Category",
            "outcome": "Outcome",
        }
    )
    # Show the category's display label; unknown keys fall back to the raw key.
    display["Category"] = display["Category"].map(
        lambda k: CATEGORY_INFO.get(k, (k, ""))[0]
    )

    column_widths = {
        "Input (noisy)": "medium",
        "Predicted": "medium",
        "Expected": "medium",
        "Category": "medium",
        "Outcome": "small",
    }
    st.dataframe(
        display,
        use_container_width=True,
        hide_index=True,
        height=540,
        column_config={
            header: st.column_config.TextColumn(width=width)
            for header, width in column_widths.items()
        },
    )
248
+
249
+
250
+ # ══════════════════════════════════════════════════════════════════════
251
+ # TAB 3 — Performance
252
+ # ══════════════════════════════════════════════════════════════════════
253
+
254
with tab_perf:
    df = load_data()

    def _pct(flags, spec: str = ".1%") -> str:
        """Format the mean of a boolean Series as a percentage.

        Returns "—" for an empty Series: its mean is NaN, which would
        otherwise be rendered as "nan%".
        """
        return format(flags.mean(), spec) if len(flags) else "—"

    needs_change = df[df["should_change"]]
    no_change = df[~df["should_change"]]

    # Headline metrics; each one tolerates an empty subset.
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total examples", f"{len(df)}")
    c2.metric("Overall EM", _pct(df["em"]))
    c3.metric("Fix accuracy", _pct(needs_change["em"]),
              help="Exact match on queries that SHOULD change")
    c4.metric("No-change precision", _pct(no_change["em"]),
              help="Correctly left unchanged queries that should NOT change")

    st.markdown("---")
    st.subheader("Per-category breakdown")

    rows = []
    for cat, (label, desc) in CATEGORY_INFO.items():
        sub = df[df["category"] == cat]
        if len(sub) == 0:
            # Categories absent from the results are left out of the table.
            continue
        needs = sub[sub["should_change"]]
        ok = sub[~sub["should_change"]]
        rows.append({
            "Category": label,
            "n": len(sub),
            "EM %": _pct(sub["em"], ".0%"),
            "Fix accuracy": _pct(needs["em"], ".0%"),
            "No-change prec.": _pct(ok["em"], ".0%"),
            "Errors": int((~sub["em"]).sum()),
            "What it tests": desc,
        })

    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Failure cases by category")
    st.caption("All queries where the normalizer produced a wrong output.")

    failures = df[~df["em"]]
    if len(failures) == 0:
        st.success("No failures!")
    else:
        # One expander per category that has at least one failure.
        for cat, (label, _) in CATEGORY_INFO.items():
            sub = failures[failures["category"] == cat]
            if len(sub) == 0:
                continue
            with st.expander(f"{label} — {len(sub)} failure{'s' if len(sub) != 1 else ''}"):
                show = sub[["noisy", "pred", "canonical", "outcome"]].copy()
                show.columns = ["Input", "Predicted", "Expected", "Outcome"]
                st.dataframe(show, use_container_width=True, hide_index=True)
benchmark.py ADDED
@@ -0,0 +1,820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query Normalization Benchmark
3
+ ==============================
4
+ Benchmarks multiple normalization approaches on the generated dataset.
5
+
6
+ Normalizers:
7
+ 1. Identity - baseline, no change
8
+ 2. PySpellChecker - token-by-token spell correction (current approach)
9
+ 3. SymSpell - faster, supports compound word correction
10
+ 4. Rules - regex + entity canonicalization (flight IDs, stock tickers, product spacing)
11
+ 5. RapidFuzz - fuzzy brand name matching
12
+ 6. Combined - Rules → SymSpell → RapidFuzz pipeline
13
+ --- ML ---
14
+ 7. ContextualSpellCheck - spaCy pipeline with BERT contextual embeddings
15
+ 8. T5SpellCorrector - HuggingFace T5 fine-tuned for spelling correction
16
+ 9. CombinedML - Rules → T5 pipeline (entity rules first, T5 for the rest)
17
+
18
+ Metrics (per normalizer, per category):
19
+ exact_match - % where output == canonical (case-insensitive)
20
+ cer - character error rate: edit_dist / max(len_pred, len_gold)
21
+ wer - word error rate: token-level edit distance / n_gold_tokens
22
+ no_change_precision - on no_change rows: % correctly left unchanged
23
+ over_correction - on no_change rows: % wrongly changed
24
+ latency_mean_ms - mean per-query latency
25
+ latency_p50_ms - p50 latency
26
+ latency_p95_ms - p95 latency
27
+ latency_p99_ms - p99 latency
28
+
29
+ Usage:
30
+ pip install -r requirements.txt
31
+ python3 benchmark.py [--dataset dataset.csv]
32
+ """
33
+
34
+ import re
35
+ import sys
36
+ import time
37
+ import argparse
38
+ import warnings
39
+ import numpy as np
40
+ import pandas as pd
41
+ from pathlib import Path
42
+ from abc import ABC, abstractmethod
43
+ from typing import Optional
44
+
45
+ warnings.filterwarnings("ignore")
46
+
47
+ # ── Optional imports ───────────────────────────────────────────────────────────
48
+
49
try:
    from Levenshtein import distance as _lev
except ImportError:
    # Pure-python fallback
    def edit_distance(a: str, b: str) -> int:
        """Return the Levenshtein distance between *a* and *b*.

        Row-by-row dynamic programme that keeps only the previous row.
        """
        previous = list(range(len(b) + 1))
        for row_no, ch_a in enumerate(a, start=1):
            current = [row_no]
            for col_no, ch_b in enumerate(b, start=1):
                if ch_a == ch_b:
                    # Matching characters carry the diagonal cost over unchanged.
                    current.append(previous[col_no - 1])
                else:
                    # Cheapest of deletion, insertion and substitution, plus one.
                    current.append(
                        1 + min(previous[col_no], current[col_no - 1], previous[col_no - 1])
                    )
            previous = current
        return previous[len(b)]
else:
    def edit_distance(a: str, b: str) -> int:
        """Return the Levenshtein distance between *a* and *b*."""
        return _lev(a, b)
63
+
64
+ try:
65
+ from spellchecker import SpellChecker as _SC
66
+ HAS_PYSPELL = True
67
+ except ImportError:
68
+ HAS_PYSPELL = False
69
+ print("Warning: pyspellchecker not installed — skipping PySpell normalizer")
70
+
71
+ try:
72
+ from symspellpy import SymSpell as _SS, Verbosity as _V
73
+ import pkg_resources
74
+ HAS_SYMSPELL = True
75
+ except ImportError:
76
+ HAS_SYMSPELL = False
77
+ print("Warning: symspellpy not installed — skipping SymSpell normalizer")
78
+
79
+ try:
80
+ from rapidfuzz import process as _rf_process, fuzz as _rf_fuzz
81
+ HAS_RAPIDFUZZ = True
82
+ except ImportError:
83
+ HAS_RAPIDFUZZ = False
84
+ print("Warning: rapidfuzz not installed — skipping RapidFuzz normalizer")
85
+
86
+ try:
87
+ import spacy as _spacy
88
+ import contextualSpellCheck as _csc
89
+ _csc_nlp = _spacy.load("en_core_web_sm")
90
+ _csc.add_to_pipe(_csc_nlp)
91
+ HAS_CONTEXTUAL = True
92
+ except Exception:
93
+ HAS_CONTEXTUAL = False
94
+ print("Warning: contextualSpellCheck/spacy not available — skipping ContextualSpellCheck normalizer")
95
+ print(" Install: pip install contextualSpellCheck && python -m spacy download en_core_web_sm")
96
+
97
+ try:
98
+ from transformers import pipeline as _hf_pipeline
99
+ HAS_TRANSFORMERS = True
100
+ except ImportError:
101
+ HAS_TRANSFORMERS = False
102
+ print("Warning: transformers not installed — skipping T5 normalizer")
103
+ print(" Install: pip install transformers torch")
104
+
105
+ # ── Brand list for fuzzy matching ──────────────────────────────────────────────
106
+
107
+ BRANDS = [
108
+ "amazon", "google", "facebook", "twitter", "instagram", "youtube",
109
+ "linkedin", "reddit", "netflix", "spotify", "microsoft", "adobe",
110
+ "dropbox", "github", "slack", "zoom", "paypal", "ebay", "walmart",
111
+ "target", "best buy", "new york times", "bbc", "cnn", "espn",
112
+ "gmail", "outlook", "yahoo", "apple", "samsung", "dell", "hp",
113
+ "lenovo", "asus", "acer", "toshiba", "sony", "lg", "panasonic",
114
+ "booking.com", "expedia", "airbnb", "tripadvisor", "yelp",
115
+ "doordash", "ubereats", "grubhub", "lyft", "uber",
116
+ "twitch", "discord", "telegram", "whatsapp", "snapchat", "tiktok",
117
+ ]
118
+
119
+ # ── Entity lists for rules normalizer ──────────────────────────────────────────
120
+
121
+ # Common IATA codes (2-3 letter airline codes)
122
+ IATA_CODES = {
123
+ "AA", "BA", "DL", "UA", "LH", "AF", "EK", "QR", "SQ", "CX",
124
+ "VS", "KL", "IB", "TK", "AC", "QF", "NH", "JL", "MH", "TG",
125
+ "AI", "SA", "ET", "KE", "OZ", "CI", "BR", "LA", "AV", "AM",
126
+ "WN", "B6", "AS", "F9", "NK", "G4", "VX", "HA",
127
+ }
128
+
129
+ # Common stock tickers → company name aliases
130
+ STOCK_ALIASES: dict[str, list[str]] = {
131
+ "AAPL": ["apple", "aapl"],
132
+ "TSLA": ["tesla", "tsla"],
133
+ "MSFT": ["microsoft", "msft"],
134
+ "GOOGL": ["google", "alphabet", "googl"],
135
+ "AMZN": ["amazon", "amzn"],
136
+ "META": ["meta", "facebook", "fb"],
137
+ "NVDA": ["nvidia", "nvda"],
138
+ "NFLX": ["netflix", "nflx"],
139
+ "PYPL": ["paypal", "pypl"],
140
+ "SNAP": ["snapchat", "snap"],
141
+ "AMD": ["amd"],
142
+ "INTC": ["intel", "intc"],
143
+ "UBER": ["uber"],
144
+ "LYFT": ["lyft"],
145
+ "ABNB": ["airbnb", "abnb"],
146
+ "COIN": ["coinbase", "coin"],
147
+ "HOOD": ["robinhood", "hood"],
148
+ }
149
+
150
+ # Reverse map: alias → ticker
151
+ _ALIAS_TO_TICKER: dict[str, str] = {}
152
+ for ticker, aliases in STOCK_ALIASES.items():
153
+ for alias in aliases:
154
+ _ALIAS_TO_TICKER[alias.lower()] = ticker
155
+
156
+ # Product model patterns: brand → canonical prefix
157
+ PRODUCT_BRANDS = ["iphone", "samsung", "macbook", "ipad", "pixel", "surface"]
158
+
159
+ # ── Base normalizer ────────────────────────────────────────────────────────────
160
+
161
class Normalizer(ABC):
    """Abstract interface shared by every query normalizer in this module."""

    # Human-readable label; concrete subclasses assign it.
    name: str

    def warmup(self) -> None:
        """Called once before benchmarking to initialize any lazy state."""
        return None

    @abstractmethod
    def normalize(self, query: str) -> str:
        """Return the normalized form of *query*."""
        ...

    def normalize_batch(self, queries: list[str]) -> list[str]:
        """Normalize each query in order by delegating to ``normalize``."""
        return list(map(self.normalize, queries))
174
+
175
+
176
+ # ── 1. Identity (baseline) ────────────────────────────────────────────────────
177
+
178
class IdentityNormalizer(Normalizer):
    """Baseline that returns every query untouched."""

    name = "Identity (baseline)"

    def normalize(self, query: str) -> str:
        """Return *query* exactly as received."""
        unchanged = query
        return unchanged
183
+
184
+
185
+ # ── 2. PySpellChecker ────────────────────────────────────────────────────────
186
+
187
class PySpellNormalizer(Normalizer):
    """Token-by-token spell correction with pyspellchecker."""

    name = "PySpellChecker"

    def __init__(self):
        """Create the spell checker.

        Raises:
            RuntimeError: if pyspellchecker could not be imported.
        """
        if not HAS_PYSPELL:
            raise RuntimeError("pyspellchecker not installed")
        self._sc = _SC()

    def normalize(self, query: str) -> str:
        """Lower-case *query*, correct each whitespace-separated token and
        re-join with single spaces. A token is kept as-is when the checker
        returns nothing for it."""
        corrected = []
        for word in query.lower().split():
            suggestion = self._sc.correction(word)
            corrected.append(suggestion or word)
        return " ".join(corrected)
198
+
199
+
200
+ # ── 3. SymSpell ───────────────────────────────────────────────────────────────
201
+
202
# Optional extra vocabulary file that sits next to this module.
_ORCAS_VOCAB = Path(__file__).parent / "orcas_vocab.txt"


class SymSpellNormalizer(Normalizer):
    """Compound spell correction via symspellpy's ``lookup_compound``.

    The base frequency dictionary is the one shipped inside the symspellpy
    package. If ``orcas_vocab.txt`` exists next to this module it is loaded
    as well and the instance is renamed to "SymSpell+ORCAS".
    """

    name = "SymSpell"

    # Candidate dictionary filenames (name changed across symspellpy versions).
    _DICT_CANDIDATES = ("frequency_dictionary_en_82_765.txt", "en-80k.txt")

    def __init__(self, max_edit_distance: int = 2):
        """Build the SymSpell index.

        Args:
            max_edit_distance: Maximum edit distance used both for the
                dictionary and for lookups.

        Raises:
            RuntimeError: if symspellpy could not be imported.
        """
        if not HAS_SYMSPELL:
            raise RuntimeError("symspellpy not installed")
        self._sym = _SS(max_dictionary_edit_distance=max_edit_distance)
        self._max_ed = max_edit_distance

        # Try importlib.resources first, then fall back to pkg_resources.
        loaded = self._load_via_importlib() or self._load_via_pkg_resources()
        if not loaded:
            # Previously this case was silent: the normalizer then ran with
            # no base dictionary and nothing explained why.
            print("Warning: SymSpell base dictionary could not be loaded — "
                  "SymSpell normalizer is running without it")

        if _ORCAS_VOCAB.exists():
            self._sym.load_dictionary(str(_ORCAS_VOCAB), term_index=0, count_index=1)
            self.name = "SymSpell+ORCAS"

    def _load_via_importlib(self) -> bool:
        """Load the first available candidate via ``importlib.resources``."""
        try:
            import importlib.resources as _ir
            _files = _ir.files
        except (ImportError, AttributeError):
            return False
        for fname in self._DICT_CANDIDATES:
            try:
                ref = _files("symspellpy").joinpath(fname)
                with _ir.as_file(ref) as dict_path:
                    if self._sym.load_dictionary(str(dict_path), term_index=0, count_index=1):
                        return True
            except Exception:
                # Best effort: a failure here just moves on to the next
                # candidate, and ultimately to the pkg_resources fallback.
                continue
        return False

    def _load_via_pkg_resources(self) -> bool:
        """Load the first available candidate via ``pkg_resources``."""
        try:
            import pkg_resources
        except ImportError:
            return False
        for fname in self._DICT_CANDIDATES:
            dict_path = pkg_resources.resource_filename("symspellpy", fname)
            if self._sym.load_dictionary(dict_path, term_index=0, count_index=1):
                return True
        return False

    def normalize(self, query: str) -> str:
        """Return the top ``lookup_compound`` suggestion for the lower-cased
        query, or the lower-cased query itself when there is no suggestion."""
        lowered = query.lower()
        # Use lookup_compound for multi-token correction
        suggestions = self._sym.lookup_compound(lowered, max_edit_distance=self._max_ed)
        return suggestions[0].term if suggestions else lowered
249
+
250
+
251
+ # ── 4. Rules (entity + regex) ────────────────────────────────────────────────
252
+
253
class RulesNormalizer(Normalizer):
    """Regex and lookup-table rules for flight IDs, stock tickers and product
    names. Stock canonicalization runs first and short-circuits; the other
    rules are applied in sequence to the same string."""

    name = "Rules (entity + regex)"

    # Flight: digits + IATA or IATA + digits → IATA + digits (no space)
    _FLIGHT_LOOSE = re.compile(
        r'\b(?:flight\s+)?(\d{2,4})\s*([A-Z]{2,3})\b'   # 163 SQ
        r'|'
        r'\b(?:flight\s+)?([A-Z]{2,3})\s+(\d{2,4})\b',  # SQ 163 (space)
        re.IGNORECASE
    )

    # Product spacing: brand directly followed by digits/variant ("iphone15")
    _PRODUCT_SPACING = re.compile(
        r'\b(iphone|macbook|ipad|pixel|galaxy|surface|airpods)'
        r'(\d+|pro|air|mini|max|ultra|plus)\b',
        re.IGNORECASE
    )

    # Stock: words whose presence marks the query as a stock lookup
    _STOCK_NOISE = re.compile(
        r'\b(stock|share|price|shares|equity|ticker|market|trading|invest(?:ment)?)\b',
        re.IGNORECASE
    )

    def _normalize_flight(self, query: str) -> str:
        """Rewrite "<digits> <code>" / "<code> <digits>" as "<CODE><digits>"
        when the code is in ``IATA_CODES``; other matches are left as found.

        An optional leading "flight " is part of the match, so it is dropped
        whenever a rewrite happens.
        """
        def _canonical(match):
            digits_first = match.group(1)
            if digits_first:   # digits IATA
                number, code = digits_first, match.group(2).upper()
            else:              # IATA digits
                code, number = match.group(3).upper(), match.group(4)
            return f"{code}{number}" if code in IATA_CODES else match.group(0)

        return self._FLIGHT_LOOSE.sub(_canonical, query)

    def _normalize_stock(self, query: str) -> Optional[str]:
        """Return the ticker when *query* looks like a stock lookup, else None."""
        lowered = query.lower().strip()
        tokens = lowered.split()

        # A token that is itself a ticker wins immediately; otherwise the
        # last alias seen determines the ticker.
        ticker = None
        for token in tokens:
            symbol = token.upper()
            if symbol in STOCK_ALIASES:
                ticker = symbol
                break
            ticker = _ALIAS_TO_TICKER.get(token, ticker)

        if not ticker:
            return None

        # Case 1: stock noise words present (e.g. "AAPL stock price")
        has_noise = self._STOCK_NOISE.sub("", lowered).strip() != lowered.strip()
        # Case 2: the ticker itself appears as a token (e.g. "apple aapl").
        # NOTE(review): this also fires for tickers that double as ordinary
        # words ("uber eats" → "UBER") — confirm that is intended.
        if has_noise or ticker.lower() in tokens:
            return ticker
        return None

    def _normalize_product_spacing(self, query: str) -> str:
        """Insert a space between a product name and a glued-on model/variant."""
        return self._PRODUCT_SPACING.sub(
            lambda match: f"{match.group(1)} {match.group(2)}", query
        )

    def _normalize_word_order(self, query: str) -> str:
        """Move the first non-leading product-brand token to the front.

        Examples:
            's24 samsung'       → 'samsung s24'
            'pro 14 macbook'    → 'macbook pro 14'
            'ultra s23 samsung' → 'samsung ultra s23'

        The result is lower-cased when a reorder happens; otherwise *query*
        is returned untouched.
        """
        tokens = query.lower().split()
        if len(tokens) < 2:
            return query
        # NOTE(review): any PRODUCT_BRANDS token after position 0 is moved, so
        # 'google pixel 8' becomes 'pixel google 8' — confirm that is intended.
        for pos, token in enumerate(tokens[1:], start=1):
            if token in PRODUCT_BRANDS:
                # Brand first, everything else keeps its relative order.
                return " ".join([token, *tokens[:pos], *tokens[pos + 1:]])
        return query

    def normalize(self, query: str) -> str:
        """Apply all rules to *query* and return the result."""
        q = query.strip()

        # 1. Stock canonicalization — returns the bare ticker when it applies.
        stock = self._normalize_stock(q)
        if stock:
            return stock

        # 2-4. Flight ID, product spacing, product word order — in that order.
        for rule in (
            self._normalize_flight,
            self._normalize_product_spacing,
            self._normalize_word_order,
        ):
            q = rule(q)

        # 5. Clean up extra whitespace
        return re.sub(r'\s+', ' ', q).strip()
356
+
357
+
358
+ # ── 5. RapidFuzz (brand matching) ────────────────────────────────────────────
359
+
360
class RapidFuzzNormalizer(Normalizer):
    """Fuzzy-match a short query against ``BRANDS``.

    The whole query is compared with each brand; on a hit the matched brand
    replaces the entire query.
    """

    name = "RapidFuzz (brand match)"

    def __init__(self, score_cutoff: int = 82):
        """Store the minimum score a brand match must reach.

        Raises:
            RuntimeError: if rapidfuzz could not be imported.
        """
        if not HAS_RAPIDFUZZ:
            raise RuntimeError("rapidfuzz not installed")
        self._cutoff = score_cutoff

    def normalize(self, query: str) -> str:
        """Return the best-matching brand, or *query* unchanged if none qualifies."""
        lowered = query.lower().strip()

        # Only attempt brand correction on short queries (≤ 3 tokens)
        too_many_tokens = len(lowered.split()) > 3
        # Skip very short queries — too ambiguous to fuzzy-match safely
        # (e.g. 'appl', 'npm', 'gcc' should not be matched to brand names)
        too_few_chars = len(lowered) <= 5
        if too_many_tokens or too_few_chars:
            return query

        # Single lookup of the full lower-cased query against the brand list.
        hit = _rf_process.extractOne(
            lowered, BRANDS,
            scorer=_rf_fuzz.token_sort_ratio,
            score_cutoff=self._cutoff,
        )
        # extractOne yields (match, score, index) on success.
        return hit[0] if hit else query
393
+
394
+
395
+ # ── 6. Combined ───────────────────────────────────────────────────────────────
396
+
397
class CombinedNormalizer(Normalizer):
    """Rules → SymSpell → RapidFuzz; the first stage that changes the query
    (ignoring case) decides the result."""

    name = "Combined (Rules + SymSpell + RapidFuzz)"

    def __init__(self):
        """Build the stages; optional ones are None when their library is missing."""
        self._rules = RulesNormalizer()
        self._symspell = SymSpellNormalizer() if HAS_SYMSPELL else None
        self._rfuzz = RapidFuzzNormalizer() if HAS_RAPIDFUZZ else None

    def normalize(self, query: str) -> str:
        """Run the stages in order and return the first real change, else the
        stripped query."""
        q = query.strip()
        baseline = q.lower()

        # Step 1: Apply entity/structural rules first (highest precision)
        ruled = self._rules.normalize(q)
        if ruled.lower() != baseline:
            return ruled  # Rules made a change — trust it

        # Step 2: SymSpell for general typo correction
        # Step 3: RapidFuzz for brand name typos SymSpell did not change
        for stage in (self._symspell, self._rfuzz):
            if stage:
                candidate = stage.normalize(q)
                if candidate.lower() != baseline:
                    return candidate

        return q
427
+
428
+
429
+ # ── 7. GuardedPySpell ────────────────────────────────────────────────────────
430
+
431
class GuardedPySpellNormalizer(Normalizer):
    """PySpellChecker with guards to prevent over-correction.

    PySpellChecker gets 88% on single_typo and 71% on multi_typo, but has
    40% over-correction on no-change queries (e.g. 'appl' → 'apple').

    Guards:
    - Skip tokens ≤ 4 chars (appl, npm, gcc, css, java, rust, echo, go)
    - Skip all-uppercase tokens (AAPL, NYC, SQ — abbreviations/tickers)

    Most legitimate short abbreviations are ≤ 4 chars or all-caps.
    Typos worth correcting are almost always ≥ 5 chars ('wheather', 'suhsi').
    """
    name = "PySpell (guarded)"

    def __init__(self):
        """Create the spell checker.

        Raises:
            RuntimeError: if pyspellchecker could not be imported.
        """
        if not HAS_PYSPELL:
            raise RuntimeError("pyspellchecker not installed")
        self._sc = _SC()

    def _skip(self, token: str) -> bool:
        """Return True when *token* must not be spell-corrected.

        *token* must be passed in its original casing, otherwise the
        all-uppercase guard cannot fire.
        """
        return len(token) <= 4 or token.isupper()

    def normalize(self, query: str) -> str:
        """Spell-correct each token of *query* unless a guard protects it.

        The output is lower-cased and joined with single spaces, as before.
        The guards are now evaluated on the original-case token: previously
        the query was lower-cased first, so ``token.isupper()`` was never
        true and all-caps tokens longer than 4 characters were still sent
        to the spell checker.
        """
        corrected = []
        for token in query.split():
            lowered = token.lower()
            if self._skip(token):
                corrected.append(lowered)
            else:
                corrected.append(self._sc.correction(lowered) or lowered)
        return " ".join(corrected)
460
+
461
+
462
+ # ── 8. CombinedV2 (Rules + GuardedPySpell + RapidFuzz) ───────────────────────
463
+
464
class CombinedV2Normalizer(Normalizer):
    """Pipeline: Rules → RapidFuzz (single-token) → SymSpell split →
    GuardedPySpell → RapidFuzz (any query).

    Each stage sees the same stripped query; the first stage whose output is
    accepted ends the pipeline. Rules covers structured entities (flight IDs,
    stock tickers, product spacing/order). For single-token queries RapidFuzz
    runs before SymSpell so brand typos are matched first (bestbuyt→best buy,
    where SymSpell would give 'best but'); SymSpell is then used only to split
    concatenated words (nearme→near me). GuardedPySpell handles general typos,
    and RapidFuzz gets a final attempt.
    """
    name = "CombinedV2 (Rules + GuardedPySpell + RapidFuzz)"

    def __init__(self):
        """Build the stages; optional ones are None when their library is missing."""
        self._rules = RulesNormalizer()
        self._symspell = SymSpellNormalizer() if HAS_SYMSPELL else None
        self._pyspell = GuardedPySpellNormalizer() if HAS_PYSPELL else None
        self._rfuzz = RapidFuzzNormalizer() if HAS_RAPIDFUZZ else None

    def normalize(self, query: str) -> str:
        """Return the first accepted stage output, or the stripped query."""
        q = query.strip()
        baseline = q.lower()
        single_token = ' ' not in q

        def _changed(candidate: str) -> bool:
            # A stage "made a change" only if it differs beyond letter case.
            return candidate.lower() != baseline

        # Step 1: Rules — flight IDs, stock tickers, product spacing/order
        ruled = self._rules.normalize(q)
        if _changed(ruled):
            return ruled

        if single_token:
            # Step 2: RapidFuzz — brand name typos for single-token queries.
            # Must run before SymSpell compound splitting: SymSpell splits
            # 'bestbuyt' into 'best but' (wrong) whereas RapidFuzz maps it
            # to 'best buy'.
            if self._rfuzz:
                fuzzy = self._rfuzz.normalize(q)
                if _changed(fuzzy):
                    return fuzzy

            # Step 3: SymSpell compound splitting for single-token queries.
            # GuardedPySpell would corrupt 'nearme'→'name', 'newyork'→'network'.
            # The SymSpell result is accepted only if it contains a space,
            # i.e. it split the word rather than substituting another word.
            if self._symspell:
                split = self._symspell.normalize(q)
                if ' ' in split:
                    return split

        # Step 4: GuardedPySpell — general typos (skips short/uppercase tokens)
        if self._pyspell:
            spelled = self._pyspell.normalize(q)
            if _changed(spelled):
                return spelled

        # Step 5: RapidFuzz — last attempt at a brand match for any query.
        # NOTE(review): RapidFuzzNormalizer returns the matched brand alone,
        # so a hit here replaces the whole query — confirm that is intended
        # for multi-token queries.
        if self._rfuzz:
            fuzzy = self._rfuzz.normalize(q)
            if _changed(fuzzy):
                return fuzzy

        return q
521
+
522
+
523
+ # ── 9. ContextualSpellCheck (spaCy + BERT) ───────────────────────────────────
524
+
525
class ContextualSpellCheckNormalizer(Normalizer):
    """Context-aware spelling correction backed by spaCy + BERT.

    The whole query is run through the contextualSpellCheck pipeline, so a
    correction is chosen with the surrounding words in view: 'appl' in an
    ambiguous context stays as-is, while 'wheather nyc' becomes
    'weather nyc'.

    Requires:
        pip install contextualSpellCheck
        python -m spacy download en_core_web_sm
    """
    name = "ContextualSpellCheck (BERT)"

    def __init__(self):
        if HAS_CONTEXTUAL:
            self._nlp = _csc_nlp
            return
        raise RuntimeError("contextualSpellCheck not available")

    def normalize(self, query: str) -> str:
        """Return the pipeline's corrected string, or *query* if it is empty."""
        # outcome_spellCheck holds the full corrected string for the doc.
        corrected = self._nlp(query)._.outcome_spellCheck
        if not corrected:
            return query
        return corrected
547
+
548
+
549
+ # ── 10. T5 Spell Corrector (HuggingFace) ─────────────────────────────────────
550
+
551
class T5SpellCorrector(Normalizer):
    """Seq2seq spelling corrector served through a HuggingFace pipeline.

    Model: oliverguhr/spelling-correction-english-base

    The model is trained on noisy→clean sentence pairs. It copes with
    multi-token typos, word order, and spacing better than dictionary-based
    approaches, at the price of much higher latency.

    Expected latency: ~100–500ms on CPU, ~20–80ms on GPU.

    Requires:
        pip install transformers torch (or transformers sentencepiece)
    """
    name = "T5 (oliverguhr/spelling-correction)"

    _MODEL_ID = "oliverguhr/spelling-correction-english-base"

    def __init__(self):
        if not HAS_TRANSFORMERS:
            raise RuntimeError("transformers not installed")
        # The pipeline is built lazily by warmup().
        self._pipe = None

    def warmup(self) -> None:
        """Build the text2text pipeline and run one throwaway query through it."""
        print(f" Loading {self._MODEL_ID}...", end=" ", flush=True)
        pipeline_kwargs = {
            "model": self._MODEL_ID,
            "tokenizer": self._MODEL_ID,
        }
        self._pipe = _hf_pipeline("text2text-generation", **pipeline_kwargs)
        # Prime the model with a dummy query
        self._pipe("warmup query", max_length=64)
        print("ready")

    def normalize(self, query: str) -> str:
        """Return the model's top generation for *query*, whitespace-stripped."""
        if self._pipe is None:
            self.warmup()
        outputs = self._pipe(query, max_length=128, num_beams=4)
        best = outputs[0]
        return best["generated_text"].strip()
589
+
590
+
591
+ # ── 11. CombinedML (Rules → T5) ──────────────────────────────────────────────
592
+
593
class CombinedMLNormalizer(Normalizer):
    """Best-of-both-worlds pipeline:

    1. Rules handle structured entity normalization (flight IDs, stock
       tickers, product model reordering).
    2. T5 handles everything else — general typos, multi-token corrections,
       brand names — using full-query context.

    This avoids running T5 on queries that rules already handle, saving
    latency on the most common structured patterns.
    """
    name = "CombinedML (Rules → T5)"

    def __init__(self):
        self._rules = RulesNormalizer()
        # None when transformers is unavailable; normalize() then only
        # applies the rules stage.
        self._t5 = T5SpellCorrector() if HAS_TRANSFORMERS else None

    def warmup(self) -> None:
        """Load the T5 model up front, if the T5 stage is enabled."""
        if self._t5:
            self._t5.warmup()

    def normalize(self, query: str) -> str:
        """Return the rules output if rules changed the query, else T5's output.

        The query is stripped of surrounding whitespace first, matching
        CombinedV2Normalizer. Without this, a rules stage that trims
        whitespace would make a padded query look "changed" and it would be
        returned without ever reaching T5.
        """
        q = query.strip()

        # Step 1: Rules first — highest precision for structured entities
        q_rules = self._rules.normalize(q)
        if q_rules.lower() != q.lower():
            return q_rules

        # Step 2: T5 for everything else
        if self._t5:
            return self._t5.normalize(q)

        return q
624
+
625
+
626
+ # ── Metrics ───────────────────────────────────────────────────────────────────
627
+
628
def char_error_rate(pred: str, gold: str) -> float:
    """Return the case-insensitive character error rate of *pred* against *gold*.

    CER = edit_distance / max(len(pred), len(gold)), with both the distance
    and the lengths taken from the lower-cased strings.  Using the same
    strings for numerator and denominator matters because lower-casing can
    change a string's length (e.g. "İ" lower-cases to two code points);
    mixing the two could push the ratio above 1.

    Returns 0.0 when both strings are empty.
    """
    pred_cmp = pred.lower()
    gold_cmp = gold.lower()
    longest = max(len(pred_cmp), len(gold_cmp))
    if longest == 0:
        return 0.0
    return edit_distance(pred_cmp, gold_cmp) / longest
633
+
634
+
635
def word_error_rate(pred: str, gold: str) -> float:
    """Return the case-insensitive word error rate of *pred* against *gold*.

    WER = token-level edit distance / number of gold tokens, where tokens
    are obtained by splitting the lower-cased strings on whitespace.

    Edge cases:
        * both strings empty -> 0.0
        * gold empty but pred not -> 1.0 (every predicted token is spurious;
          previously this was reported as 0.0, i.e. a perfect score, which
          also disagreed with char_error_rate for the same inputs)
    """
    hyp = pred.lower().split()
    ref = gold.lower().split()
    if not ref:
        return 1.0 if hyp else 0.0

    # Row-by-row Levenshtein over tokens; `previous` is the row for the
    # hypothesis prefix one token shorter than the current one.
    previous = list(range(len(ref) + 1))
    for i, hyp_tok in enumerate(hyp, start=1):
        current = [i]
        for j, ref_tok in enumerate(ref, start=1):
            if hyp_tok == ref_tok:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current
    return previous[-1] / len(ref)
650
+
651
+
652
def run_benchmark(normalizer: Normalizer, df: pd.DataFrame, n_timing_reps: int = 5) -> dict:
    """Run a normalizer on the dataset and return metrics.

    Args:
        normalizer: Object exposing ``normalize(str) -> str`` and ``name``.
        df: Dataset with ``noisy``, ``canonical``, ``category`` and
            ``should_change`` columns. It is not modified; a copy carrying
            the predictions and per-row metrics is returned under ``"_df"``.
        n_timing_reps: Number of timed calls per query; the reported latency
            is the mean over these calls. Must be at least 1.

    Returns:
        A dict with overall exact match, mean CER/WER, no-change precision,
        over-correction rate, latency statistics in milliseconds, per-category
        exact match, plus the annotated dataframe (``"_df"``) and the raw
        per-query latencies (``"_latencies"``).

    Raises:
        ValueError: If ``n_timing_reps`` is smaller than 1.
    """
    if n_timing_reps < 1:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError("n_timing_reps must be >= 1")

    queries = df["noisy"].tolist()

    # ── Timing + predictions ─────────────────────────────────────────────────
    # The output of the last timed call doubles as the prediction, so each
    # query does not need an extra normalization pass afterwards.
    latencies_ms = []
    preds = []
    for q in queries:
        t0 = time.perf_counter()
        for _ in range(n_timing_reps):
            pred = normalizer.normalize(q)
        t1 = time.perf_counter()
        latencies_ms.append((t1 - t0) / n_timing_reps * 1000)
        preds.append(pred)

    df = df.copy()
    df["pred"] = preds

    # ── Per-row metrics ──────────────────────────────────────────────────────
    pairs = list(zip(df["pred"], df["canonical"]))
    df["em"] = [p.lower().strip() == g.lower().strip() for p, g in pairs]
    df["cer"] = [char_error_rate(p, g) for p, g in pairs]
    df["wer"] = [word_error_rate(p, g) for p, g in pairs]

    # No-change precision and over-correction rate, measured only on rows
    # whose query should be left as typed.
    nc = df[~df["should_change"]]
    if len(nc):
        unchanged = nc["pred"].str.lower().str.strip() == nc["noisy"].str.lower().str.strip()
        no_change_precision = unchanged.mean()
        over_correction = 1.0 - no_change_precision
    else:
        no_change_precision = float("nan")
        over_correction = float("nan")

    # ── Per-category exact match ──────────────────────────────────────────────
    cat_em = df.groupby("category")["em"].mean().to_dict()

    return {
        "name": normalizer.name,
        "exact_match": df["em"].mean(),
        "cer_mean": df["cer"].mean(),
        "wer_mean": df["wer"].mean(),
        "no_change_precision": no_change_precision,
        "over_correction": over_correction,
        "latency_mean_ms": np.mean(latencies_ms),
        "latency_p50_ms": np.percentile(latencies_ms, 50),
        "latency_p95_ms": np.percentile(latencies_ms, 95),
        "latency_p99_ms": np.percentile(latencies_ms, 99),
        "per_category": cat_em,
        "_df": df,  # store for detailed output
        "_latencies": latencies_ms,
    }
701
+
702
+
703
+ # ── Main ──────────────────────────────────────────────────────────────────────
704
+
705
def _print_table(rows: list) -> None:
    """Print a list of row dicts with tabulate when installed, else via pandas."""
    try:
        from tabulate import tabulate
    except ImportError:
        print(pd.DataFrame(rows).to_string(index=False))
    else:
        print(tabulate(rows, headers="keys", tablefmt="rounded_outline"))


def main():
    """Benchmark every available normalizer on the dataset and report results.

    Reads the dataset CSV, runs each normalizer whose dependencies are
    installed, prints an overall summary table, a per-category exact-match
    table and sample predictions, and finally writes the predictions of the
    combined normalizer to ``results.csv`` next to the dataset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default=str(Path(__file__).parent / "dataset.csv"))
    parser.add_argument("--reps", type=int, default=5, help="Timing repetitions per query")
    args = parser.parse_args()
    if args.reps < 1:
        # Latency is averaged over the repetitions, so zero would divide by zero.
        parser.error("--reps must be >= 1")

    df = pd.read_csv(args.dataset)
    print(f"Loaded {len(df)} rows from {args.dataset}")
    print(f"Categories: {df['category'].value_counts().to_dict()}\n")

    # ── Build normalizer list ─────────────────────────────────────────────────
    normalizers: list[Normalizer] = [IdentityNormalizer(), RulesNormalizer()]
    if HAS_PYSPELL:
        normalizers.append(PySpellNormalizer())
    if HAS_SYMSPELL:
        normalizers.append(SymSpellNormalizer())
    if HAS_RAPIDFUZZ:
        normalizers.append(RapidFuzzNormalizer())
    if HAS_SYMSPELL and HAS_RAPIDFUZZ:
        normalizers.append(CombinedNormalizer())
    if HAS_PYSPELL:
        normalizers.append(GuardedPySpellNormalizer())
    if HAS_PYSPELL and HAS_RAPIDFUZZ:
        normalizers.append(CombinedV2Normalizer())
    # ML normalizers (disabled — too slow and underperform rules-based)
    # if HAS_CONTEXTUAL:
    #     normalizers.append(ContextualSpellCheckNormalizer())
    # if HAS_TRANSFORMERS:
    #     normalizers.append(T5SpellCorrector())
    #     normalizers.append(CombinedMLNormalizer())

    # Warmup
    for norm in normalizers:
        norm.warmup()

    # ── Run benchmarks ────────────────────────────────────────────────────────
    results = []
    for norm in normalizers:
        print(f"Benchmarking: {norm.name}...", end=" ", flush=True)
        r = run_benchmark(norm, df, n_timing_reps=args.reps)
        results.append(r)
        print(f"EM={r['exact_match']:.1%} CER={r['cer_mean']:.3f} lat_p50={r['latency_p50_ms']:.2f}ms")

    # ── Summary table ─────────────────────────────────────────────────────────
    print("\n" + "="*90)
    print("SUMMARY — Overall Metrics")
    print("="*90)

    summary_rows = [
        {
            "Normalizer": r["name"],
            "Exact Match": f"{r['exact_match']:.1%}",
            "CER": f"{r['cer_mean']:.3f}",
            "WER": f"{r['wer_mean']:.3f}",
            "No-change Prec.": f"{r['no_change_precision']:.1%}" if not np.isnan(r['no_change_precision']) else "N/A",
            "Over-correction": f"{r['over_correction']:.1%}" if not np.isnan(r['over_correction']) else "N/A",
            "Lat mean (ms)": f"{r['latency_mean_ms']:.2f}",
            "Lat p50 (ms)": f"{r['latency_p50_ms']:.2f}",
            "Lat p95 (ms)": f"{r['latency_p95_ms']:.2f}",
            "Lat p99 (ms)": f"{r['latency_p99_ms']:.2f}",
        }
        for r in results
    ]
    _print_table(summary_rows)

    # ── Per-category table ────────────────────────────────────────────────────
    categories = sorted(df["category"].unique())
    print("\n" + "="*90)
    print("PER-CATEGORY Exact Match")
    print("="*90)

    cat_rows = []
    for r in results:
        row = {"Normalizer": r["name"][:30]}
        for cat in categories:
            row[cat] = f"{r['per_category'].get(cat, float('nan')):.0%}"
        cat_rows.append(row)
    _print_table(cat_rows)

    # ── Sample predictions ────────────────────────────────────────────────────
    print("\n" + "="*90)
    print("SAMPLE PREDICTIONS — Combined vs Identity (first 5 per category)")
    print("="*90)

    # Prefer CombinedV2, then any other "Combined" normalizer, then whichever
    # normalizer ran last.
    combined_r = next((r for r in results if "CombinedV2" in r["name"]),
                      next((r for r in results if "Combined" in r["name"]), results[-1]))
    combined_df = combined_r["_df"]

    for cat in categories:
        sub = combined_df[combined_df["category"] == cat].head(5)
        print(f"\n {cat.upper()}")
        print(f" {'Noisy':<30} {'Canonical':<25} {'Combined pred':<25} {'EM':>4}")
        print(f" {'-'*30} {'-'*25} {'-'*25} {'-'*4}")
        for _, row in sub.iterrows():
            em_mark = "✓" if row["em"] else "✗"
            print(f" {row['noisy']:<30} {row['canonical']:<25} {row['pred']:<25} {em_mark:>4}")

    # ── Save full results ─────────────────────────────────────────────────────
    out_path = Path(args.dataset).parent / "results.csv"
    combined_df.to_csv(out_path, index=False)
    print(f"\nFull predictions saved to {out_path}")
817
+
818
+
819
+ if __name__ == "__main__":
820
+ main()
dataset.csv ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ noisy,canonical,category,should_change
2
+ wheather nyc,weather nyc,single_typo,True
3
+ calclator,calculator,single_typo,True
4
+ forcast london,forecast london,single_typo,True
5
+ temprature converter,temperature converter,single_typo,True
6
+ restaurent near me,restaurant near me,single_typo,True
7
+ translater english,translator english,single_typo,True
8
+ defintion of entropy,definition of entropy,single_typo,True
9
+ seperate the words,separate the words,single_typo,True
10
+ accomodation paris,accommodation paris,single_typo,True
11
+ recieve email,receive email,single_typo,True
12
+ suhsi near me,sushi near me,multi_typo,True
13
+ restarant near me,restaurant near me,multi_typo,True
14
+ wether forcast today,weather forecast today,multi_typo,True
15
+ plmber emergancy,plumber emergency,multi_typo,True
16
+ nearist cofee shop,nearest coffee shop,multi_typo,True
17
+ cheep flihgts paris,cheap flights paris,multi_typo,True
18
+ hotl delas nyc,hotel deals nyc,multi_typo,True
19
+ hosptial emergancy rm,hospital emergency room,multi_typo,True
20
+ bestbuyt,best buy,brand_typo,True
21
+ youtueb,youtube,brand_typo,True
22
+ gooogle maps,google maps,brand_typo,True
23
+ amazom prime,amazon prime,brand_typo,True
24
+ netflx login,netflix login,brand_typo,True
25
+ spotifiy premium,spotify premium,brand_typo,True
26
+ facbook login,facebook login,brand_typo,True
27
+ instagrem,instagram,brand_typo,True
28
+ linkdin profile,linkedin profile,brand_typo,True
29
+ gitub repo,github repo,brand_typo,True
30
+ 163 SQ,SQ163,flight_order,True
31
+ 100 AA,AA100,flight_order,True
32
+ 417 BA,BA417,flight_order,True
33
+ SQ 163,SQ163,flight_order,True
34
+ AA 100,AA100,flight_order,True
35
+ 815 DL,DL815,flight_order,True
36
+ 200 UA,UA200,flight_order,True
37
+ flight 163 SQ,SQ163,flight_order,True
38
+ AA flight 100,AA100,flight_order,True
39
+ 15 iphone,iphone 15,product_order,True
40
+ pro 14 macbook,macbook pro 14,product_order,True
41
+ s24 samsung,samsung s24,product_order,True
42
+ ultra s23 samsung,samsung s23 ultra,product_order,True
43
+ air 13 macbook,macbook air 13,product_order,True
44
+ pro ipad 12,ipad pro 12,product_order,True
45
+ max pro 15 iphone,iphone 15 pro max,product_order,True
46
+ pixel 8 google,google pixel 8,product_order,True
47
+ tab s9 samsung,samsung tab s9,product_order,True
48
+ AAPL stock,AAPL,stock_canon,True
49
+ stock TSLA,TSLA,stock_canon,True
50
+ apple aapl,AAPL,stock_canon,True
51
+ tesla stock price,TSLA,stock_canon,True
52
+ MSFT share price,MSFT,stock_canon,True
53
+ google stock GOOGL,GOOGL,stock_canon,True
54
+ amazon AMZN stock,AMZN,stock_canon,True
55
+ meta stock FB,META,stock_canon,True
56
+ nvda share price,NVDA,stock_canon,True
57
+ iphone15,iphone 15,spacing,True
58
+ macbookpro,macbook pro,spacing,True
59
+ nearme,near me,spacing,True
60
+ bestbuy,best buy,spacing,True
61
+ newyork,new york,spacing,True
62
+ unitedstates,united states,spacing,True
63
+ wifi password,wifi password,spacing,False
64
+ hotdog,hotdog,spacing,False
65
+ appl,appl,no_change,False
66
+ rust,rust,no_change,False
67
+ delta,delta,no_change,False
68
+ apple,apple,no_change,False
69
+ python,python,no_change,False
70
+ java,java,no_change,False
71
+ echo,echo,no_change,False
72
+ spring,spring,no_change,False
73
+ cloud,cloud,no_change,False
74
+ mercury,mercury,no_change,False
75
+ npm,npm,no_change,False
76
+ gcc,gcc,no_change,False
77
+ css,css,no_change,False
78
+ go,go,no_change,False
79
+ swift,swift,no_change,False
80
+ waether forecast tomorrow,weather forecast tomorrow,single_typo,True
81
+ best pizzeria near me,best pizzeria near me,single_typo,False
82
+ how to cook pasta,how to cook pasta,single_typo,False
83
+ gas stations nearby,gas stations nearby,single_typo,False
84
+ resturant reservations online,restaurant reservations online,single_typo,True
85
+ puplic libraries near me,public libraries near me,single_typo,True
86
+ best plumber in my area,best plumber in my area,single_typo,False
87
+ forcast weekend weather,forecast weekend weather,single_typo,True
88
+ how to pronounce worcester,how to pronounce worcester,single_typo,False
89
+ recipie for chocolate cake,recipe for chocolate cake,single_typo,True
90
+ hardware store locator,hardware store locator,single_typo,False
91
+ trafic conditions now,traffic conditions now,single_typo,True
92
+ vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False
93
+ how to spell occassion,how to spell occasion,single_typo,True
94
+ dentist appointements available,dentist appointments available,single_typo,True
95
+ nearest pharmacy open now,nearest pharmacy open now,single_typo,False
96
+ beginner gardening tips,beginner gardening tips,single_typo,False
97
+ calorie counter app,calorie counter app,single_typo,False
98
+ what is the defintion of serendipity,what is the definition of serendipity,single_typo,True
99
+ best electrician in town,best electrician in town,single_typo,False
100
+ humidty levels today,humidity levels today,single_typo,True
101
+ directions to airport,directions to airport,single_typo,False
102
+ how to spel definitely,how to spell definitely,single_typo,True
103
+ grocery stores near me,grocery stores near me,single_typo,False
104
+ buisness hours for target,business hours for target,single_typo,True
105
+ weather in chicago tommorow,weather in chicago tomorrow,single_typo,True
106
+ how to make omlette,how to make omelette,single_typo,True
107
+ atm machine locations,atm machine locations,single_typo,False
108
+ barber shop avaialble,barber shop available,single_typo,True
109
+ best restauarant in boston,best restaurant in boston,single_typo,True
110
+ how to cook brocoli,how to cook broccoli,single_typo,True
111
+ swimming pools near me,swimming pools near me,single_typo,False
112
+ seperate the documents,separate the documents,single_typo,True
113
+ temperture in fahrenheit,temperature in fahrenheit,single_typo,True
114
+ parking garage nearby,parking garage nearby,single_typo,False
115
+ how to make lasagna recepie,how to make lasagna recipe,single_typo,True
116
+ veterinary clinic hours,veterinary clinic hours,single_typo,False
117
+ what does recieve mean,what does receive mean,single_typo,True
118
+ yoga classes availible now,yoga classes available now,single_typo,True
119
+ begininng spanish lessons,beginning spanish lessons,single_typo,True
120
+ plumber near me emergancy,plumber near me emergency,multi_typo,True
121
+ dentist appoitment availble,dentist appointment available,multi_typo,True
122
+ electritian repaire servises,electrician repair services,multi_typo,True
123
+ weather forcast this weakend,weather forecast this weekend,multi_typo,True
124
+ neerest gass station,nearest gas station,multi_typo,True
125
+ hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True
126
+ autombile mechanic lokation,automobile mechanic location,multi_typo,True
127
+ humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True
128
+ locksmith emergancy servise,locksmith emergency service,multi_typo,True
129
+ flight ticket prices comparision,flight ticket prices comparison,multi_typo,True
130
+ restauant reservaton opem,restaurant reservation open,multi_typo,True
131
+ carpentor contrator estimat,carpenter contractor estimate,multi_typo,True
132
+ tempreture alert wheather,temperature alert weather,multi_typo,True
133
+ tourist atraction guidebook,tourist attraction guidebook,multi_typo,True
134
+ laundry servise neerby,laundry service nearby,multi_typo,True
135
+ vehcile registation renewel,vehicle registration renewal,multi_typo,True
136
+ snowstrom warning forcast,snowstorm warning forecast,multi_typo,True
137
+ hostotel accomodation deals,hostel accommodation deals,multi_typo,True
138
+ haircut appoitment schedul,haircut appointment schedule,multi_typo,True
139
+ sunrise time locaton,sunrise time location,multi_typo,True
140
+ moving compeny quoate,moving company quote,multi_typo,True
141
+ road conidtion trafic update,road condition traffic update,multi_typo,True
142
+ veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True
143
+ vacation packge discunt availble,vacation package discount available,multi_typo,True
144
+ pest contral servise lokation,pest control service location,multi_typo,True
145
+ pollin forcast alergy,pollen forecast allergy,multi_typo,True
146
+ airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True
147
+ window cleening compny rates,window cleaning company rates,multi_typo,True
148
+ wind gust wheather alert,wind gust weather alert,multi_typo,True
149
+ rentral car comparision price,rental car comparison price,multi_typo,True
150
+ goggle.com,google.com,brand_typo,True
151
+ amazn.com,amazon.com,brand_typo,True
152
+ spotfiy music,spotify music,brand_typo,True
153
+ instgram app,instagram app,brand_typo,True
154
+ gitub profile,github profile,brand_typo,True
155
+ redditt.com,reddit.com,brand_typo,True
156
+ twiter feed,twitter feed,brand_typo,True
157
+ linkdin jobs,linkedin jobs,brand_typo,True
158
+ microsodt office,microsoft office,brand_typo,True
159
+ adoobe creative,adobe creative,brand_typo,True
160
+ dropbx files,dropbox files,brand_typo,True
161
+ zom meeting,zoom meeting,brand_typo,True
162
+ slck workspace,slack workspace,brand_typo,True
163
+ paypa checkout,paypal checkout,brand_typo,True
164
+ ebya auction,ebay auction,brand_typo,True
165
+ wallmart groceries,walmart groceries,brand_typo,True
166
+ targat deals,target deals,brand_typo,True
167
+ nytimez news,nytimes news,brand_typo,True
168
+ bbc.co.uk,bbc.com,brand_typo,True
169
+ cnn breaking,cnn breaking news,brand_typo,True
170
+ youtub video,youtube video,brand_typo,True
171
+ netflic series,netflix series,brand_typo,True
172
+ googl drive,google drive,brand_typo,True
173
+ amzon shopping,amazon shopping,brand_typo,True
174
+ spotiffy playlist,spotify playlist,brand_typo,True
175
+ facebk messenger,facebook messenger,brand_typo,True
176
+ insta stories,instagram stories,brand_typo,True
177
+ gihub code,github code,brand_typo,True
178
+ reddot forum,reddit forum,brand_typo,True
179
+ BA 287,BA287,flight_order,True
180
+ 502 DL,DL502,flight_order,True
181
+ flight UA 441,UA441,flight_order,True
182
+ lh 156,LH156,flight_order,True
183
+ 273 AF,AF273,flight_order,True
184
+ EK 89,EK89,flight_order,True
185
+ 621qr,QR621,flight_order,True
186
+ CX 884,CX884,flight_order,True
187
+ 345 vs,VS345,flight_order,True
188
+ KL 714,KL714,flight_order,True
189
+ 193ib,IB193,flight_order,True
190
+ TK 427,TK427,flight_order,True
191
+ flight 556 AA,AA556,flight_order,True
192
+ 738 ba,BA738,flight_order,True
193
+ DL 212,DL212,flight_order,True
194
+ 84ua,UA84,flight_order,True
195
+ AF 609,AF609,flight_order,True
196
+ 445 ek,EK445,flight_order,True
197
+ sq 267,SQ267,flight_order,True
198
+ 572 CX,CX572,flight_order,True
199
+ flight vs314,VS314,flight_order,True
200
+ 981 KL,KL981,flight_order,True
201
+ IB 456,IB456,flight_order,True
202
+ tk 103,TK103,flight_order,True
203
+ 890 SQ,SQ890,flight_order,True
204
+ 13 air macbook,macbook air 13,product_order,True
205
+ pro iphone 15,iphone 15 pro,product_order,True
206
+ 8 pixel google,google pixel 8,product_order,True
207
+ ultra 24 s23 samsung,samsung s23 ultra,product_order,True
208
+ fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True
209
+ max 14 pro iphone,iphone 14 pro max,product_order,True
210
+ 16 macbook pro,macbook pro 16,product_order,True
211
+ pixel pro 7 google,google pixel 7 pro,product_order,True
212
+ tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True
213
+ 12 mini iphone,iphone 12 mini,product_order,True
214
+ z fold 4 samsung,samsung galaxy z fold 4,product_order,True
215
+ watch series 9 apple,apple watch series 9,product_order,True
216
+ xl pixel 8 google,google pixel 8 xl,product_order,True
217
+ s24 ultra samsung,samsung s24 ultra,product_order,True
218
+ 15 macbook air,macbook air 15,product_order,True
219
+ iphone pro 13,iphone 13 pro,product_order,True
220
+ flip 5 z samsung,samsung galaxy z flip 5,product_order,True
221
+ 7 series watch apple,apple watch series 7,product_order,True
222
+ a15 oneplus,oneplus a15,product_order,True
223
+ pad air 11 ipad,ipad air 11,product_order,True
224
+ ultra 15 iphone pro,iphone 15 pro max,product_order,True
225
+ note 24 galaxy samsung,samsung galaxy note 24,product_order,True
226
+ 11 pro max iphone,iphone 11 pro max,product_order,True
227
+ studio display apple,apple studio display,product_order,True
228
+ x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True
229
+ AAPL stock price,AAPL,stock_canon,True
230
+ tesla share price,TSLA,stock_canon,True
231
+ MSFT earnings,MSFT,stock_canon,True
232
+ google GOOGL stock,GOOGL,stock_canon,True
233
+ AMZN share,AMZN,stock_canon,True
234
+ amazon price AMZN,AMZN,stock_canon,True
235
+ META stock price,META,stock_canon,True
236
+ nvidia NVDA,NVDA,stock_canon,True
237
+ NFLX share price,NFLX,stock_canon,True
238
+ netflix stock NFLX,NFLX,stock_canon,True
239
+ PYPL price,PYPL,stock_canon,True
240
+ paypal PYPL stock,PYPL,stock_canon,True
241
+ SNAP stock,SNAP,stock_canon,True
242
+ snapchat SNAP,SNAP,stock_canon,True
243
+ AMD share price,AMD,stock_canon,True
244
+ amd processor stock,AMD,stock_canon,True
245
+ INTC earnings,INTC,stock_canon,True
246
+ intel INTC stock,INTC,stock_canon,True
247
+ QCOM price,QCOM,stock_canon,True
248
+ qualcomm QCOM,QCOM,stock_canon,True
249
+ UBER stock price,UBER,stock_canon,True
250
+ lyft LYFT share,LYFT,stock_canon,True
251
+ airbnb ABNB stock,ABNB,stock_canon,True
252
+ iphone15pro,iphone 15 pro,spacing,True
253
+ samsungz9,samsung z9,spacing,True
254
+ ipadair,ipad air,spacing,True
255
+ losangeles,los angeles,spacing,True
256
+ sanfrancisco,san francisco,spacing,True
257
+ nearbyshops,nearby shops,spacing,True
258
+ dellxps13,dell xps 13,spacing,True
259
+ surfacelaptopp5,surface laptop p5,spacing,True
260
+ newyorkpizza,new york pizza,spacing,True
261
+ holmescompany,holmes company,spacing,True
262
+ findme,find me,spacing,True
263
+ bostonma,boston ma,spacing,True
264
+ sanjose,san jose,spacing,True
265
+ pixelwatch2,pixel watch 2,spacing,True
266
+ openpizza,open pizza,spacing,True
267
+ northcarolina,north carolina,spacing,True
268
+ showevenear,show venues near,spacing,True
269
+ galax30series,galax 30 series,spacing,True
270
+ laptop,laptop,spacing,False
271
+ smartphone,smartphone,spacing,False
272
+ keyboard,keyboard,spacing,False
273
+ monitor,monitor,spacing,False
274
+ NYC,NYC,no_change,False
275
+ LA,LA,no_change,False
276
+ UK,UK,no_change,False
277
+ vue,vue,no_change,False
278
+ aws,aws,no_change,False
279
+ sql,sql,no_change,False
280
+ git,git,no_change,False
281
+ c,c,no_change,False
282
+ x,x,no_change,False
283
+ r,r,no_change,False
284
+ z,z,no_change,False
285
+ kafka,kafka,no_change,False
286
+ nginx,nginx,no_change,False
287
+ vim,vim,no_change,False
288
+ pdf,pdf,no_change,False
289
+ xml,xml,no_change,False
290
+ svg,svg,no_change,False
291
+ gcp,gcp,no_change,False
292
+ cli,cli,no_change,False
293
+ api,api,no_change,False
294
+ jwt,jwt,no_change,False
295
+ mvp,mvp,no_change,False
296
+ gdpr,gdpr,no_change,False
297
+ crm,crm,no_change,False
298
+ ux,ux,no_change,False
299
+ pwa,pwa,no_change,False
300
+ orm,orm,no_change,False
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
1
+ streamlit>=1.32.0
2
  pandas
3
+ symspellpy
4
+ rapidfuzz
5
+ python-Levenshtein
6
+ pyspellchecker
results.csv ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ noisy,canonical,category,should_change,pred,em,cer,wer
2
+ wheather nyc,weather nyc,single_typo,True,whether nyc,False,0.18181818181818182,0.5
3
+ calclator,calculator,single_typo,True,calculator,True,0.0,0.0
4
+ forcast london,forecast london,single_typo,True,forecast london,True,0.0,0.0
5
+ temprature converter,temperature converter,single_typo,True,temperature converter,True,0.0,0.0
6
+ restaurent near me,restaurant near me,single_typo,True,restaurant near me,True,0.0,0.0
7
+ translater english,translator english,single_typo,True,translate english,False,0.1111111111111111,0.5
8
+ defintion of entropy,definition of entropy,single_typo,True,definition of entropy,True,0.0,0.0
9
+ seperate the words,separate the words,single_typo,True,separate the words,True,0.0,0.0
10
+ accomodation paris,accommodation paris,single_typo,True,accommodation paris,True,0.0,0.0
11
+ recieve email,receive email,single_typo,True,receive email,True,0.0,0.0
12
+ suhsi near me,sushi near me,multi_typo,True,sushi near me,True,0.0,0.0
13
+ restarant near me,restaurant near me,multi_typo,True,restaurant near me,True,0.0,0.0
14
+ wether forcast today,weather forecast today,multi_typo,True,wether forecast today,False,0.045454545454545456,0.3333333333333333
15
+ plmber emergancy,plumber emergency,multi_typo,True,plumber emergency,True,0.0,0.0
16
+ nearist cofee shop,nearest coffee shop,multi_typo,True,nearest coffee shop,True,0.0,0.0
17
+ cheep flihgts paris,cheap flights paris,multi_typo,True,cheep flights paris,False,0.05263157894736842,0.3333333333333333
18
+ hotl delas nyc,hotel deals nyc,multi_typo,True,hotl delay nyc,False,0.26666666666666666,0.6666666666666666
19
+ hosptial emergancy rm,hospital emergency room,multi_typo,True,hospital emergency rm,False,0.08695652173913043,0.3333333333333333
20
+ bestbuyt,best buy,brand_typo,True,best buy,True,0.0,0.0
21
+ youtueb,youtube,brand_typo,True,youtube,True,0.0,0.0
22
+ gooogle maps,google maps,brand_typo,True,google maps,True,0.0,0.0
23
+ amazom prime,amazon prime,brand_typo,True,amazon prime,True,0.0,0.0
24
+ netflx login,netflix login,brand_typo,True,netflix login,True,0.0,0.0
25
+ spotifiy premium,spotify premium,brand_typo,True,spotifiy premium,False,0.0625,0.5
26
+ facbook login,facebook login,brand_typo,True,facebook login,True,0.0,0.0
27
+ instagrem,instagram,brand_typo,True,instagram,True,0.0,0.0
28
+ linkdin profile,linkedin profile,brand_typo,True,linking profile,False,0.1875,0.5
29
+ gitub repo,github repo,brand_typo,True,tub repo,False,0.2727272727272727,0.5
30
+ 163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
31
+ 100 AA,AA100,flight_order,True,AA100,True,0.0,0.0
32
+ 417 BA,BA417,flight_order,True,BA417,True,0.0,0.0
33
+ SQ 163,SQ163,flight_order,True,SQ163,True,0.0,0.0
34
+ AA 100,AA100,flight_order,True,AA100,True,0.0,0.0
35
+ 815 DL,DL815,flight_order,True,DL815,True,0.0,0.0
36
+ 200 UA,UA200,flight_order,True,UA200,True,0.0,0.0
37
+ flight 163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
38
+ AA flight 100,AA100,flight_order,True,AA flight 100,False,0.6153846153846154,3.0
39
+ 15 iphone,iphone 15,product_order,True,iphone 15,True,0.0,0.0
40
+ pro 14 macbook,macbook pro 14,product_order,True,macbook pro 14,True,0.0,0.0
41
+ s24 samsung,samsung s24,product_order,True,samsung s24,True,0.0,0.0
42
+ ultra s23 samsung,samsung s23 ultra,product_order,True,samsung ultra s23,False,0.47058823529411764,0.6666666666666666
43
+ air 13 macbook,macbook air 13,product_order,True,macbook air 13,True,0.0,0.0
44
+ pro ipad 12,ipad pro 12,product_order,True,ipad pro 12,True,0.0,0.0
45
+ max pro 15 iphone,iphone 15 pro max,product_order,True,iphone max pro 15,False,0.35294117647058826,0.5
46
+ pixel 8 google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
47
+ tab s9 samsung,samsung tab s9,product_order,True,samsung tab s9,True,0.0,0.0
48
+ AAPL stock,AAPL,stock_canon,True,AAPL,True,0.0,0.0
49
+ stock TSLA,TSLA,stock_canon,True,TSLA,True,0.0,0.0
50
+ apple aapl,AAPL,stock_canon,True,AAPL,True,0.0,0.0
51
+ tesla stock price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
52
+ MSFT share price,MSFT,stock_canon,True,MSFT,True,0.0,0.0
53
+ google stock GOOGL,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
54
+ amazon AMZN stock,AMZN,stock_canon,True,AMZN,True,0.0,0.0
55
+ meta stock FB,META,stock_canon,True,META,True,0.0,0.0
56
+ nvda share price,NVDA,stock_canon,True,NVDA,True,0.0,0.0
57
+ iphone15,iphone 15,spacing,True,iphone 15,True,0.0,0.0
58
+ macbookpro,macbook pro,spacing,True,macbook pro,True,0.0,0.0
59
+ nearme,near me,spacing,True,near me,True,0.0,0.0
60
+ bestbuy,best buy,spacing,True,best buy,True,0.0,0.0
61
+ newyork,new york,spacing,True,new york,True,0.0,0.0
62
+ unitedstates,united states,spacing,True,united states,True,0.0,0.0
63
+ wifi password,wifi password,spacing,False,wifi password,True,0.0,0.0
64
+ hotdog,hotdog,spacing,False,hotdog,True,0.0,0.0
65
+ appl,appl,no_change,False,appl,True,0.0,0.0
66
+ rust,rust,no_change,False,rust,True,0.0,0.0
67
+ delta,delta,no_change,False,delta,True,0.0,0.0
68
+ apple,apple,no_change,False,apple,True,0.0,0.0
69
+ python,python,no_change,False,python,True,0.0,0.0
70
+ java,java,no_change,False,java,True,0.0,0.0
71
+ echo,echo,no_change,False,echo,True,0.0,0.0
72
+ spring,spring,no_change,False,spring,True,0.0,0.0
73
+ cloud,cloud,no_change,False,cloud,True,0.0,0.0
74
+ mercury,mercury,no_change,False,mercury,True,0.0,0.0
75
+ npm,npm,no_change,False,npm,True,0.0,0.0
76
+ gcc,gcc,no_change,False,gcc,True,0.0,0.0
77
+ css,css,no_change,False,css,True,0.0,0.0
78
+ go,go,no_change,False,go,True,0.0,0.0
79
+ swift,swift,no_change,False,swift,True,0.0,0.0
80
+ waether forecast tomorrow,weather forecast tomorrow,single_typo,True,whether forecast tomorrow,False,0.08,0.3333333333333333
81
+ best pizzeria near me,best pizzeria near me,single_typo,False,best pizzeria near me,True,0.0,0.0
82
+ how to cook pasta,how to cook pasta,single_typo,False,how to cook pasta,True,0.0,0.0
83
+ gas stations nearby,gas stations nearby,single_typo,False,gas stations nearby,True,0.0,0.0
84
+ resturant reservations online,restaurant reservations online,single_typo,True,restaurant reservations online,True,0.0,0.0
85
+ puplic libraries near me,public libraries near me,single_typo,True,public libraries near me,True,0.0,0.0
86
+ best plumber in my area,best plumber in my area,single_typo,False,best plumber in my area,True,0.0,0.0
87
+ forcast weekend weather,forecast weekend weather,single_typo,True,forecast weekend weather,True,0.0,0.0
88
+ how to pronounce worcester,how to pronounce worcester,single_typo,False,how to pronounce worcester,True,0.0,0.0
89
+ recipie for chocolate cake,recipe for chocolate cake,single_typo,True,recipe for chocolate cake,True,0.0,0.0
90
+ hardware store locator,hardware store locator,single_typo,False,hardware store locator,True,0.0,0.0
91
+ trafic conditions now,traffic conditions now,single_typo,True,traffic conditions now,True,0.0,0.0
92
+ vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False,vacuum cleaner reviews,True,0.0,0.0
93
+ how to spell occassion,how to spell occasion,single_typo,True,how to spell occasion,True,0.0,0.0
94
+ dentist appointements available,dentist appointments available,single_typo,True,dentist appointments available,True,0.0,0.0
95
+ nearest pharmacy open now,nearest pharmacy open now,single_typo,False,nearest pharmacy open now,True,0.0,0.0
96
+ beginner gardening tips,beginner gardening tips,single_typo,False,beginner gardening tips,True,0.0,0.0
97
+ calorie counter app,calorie counter app,single_typo,False,calorie counter app,True,0.0,0.0
98
+ what is the defintion of serendipity,what is the definition of serendipity,single_typo,True,what is the definition of serendipity,True,0.0,0.0
99
+ best electrician in town,best electrician in town,single_typo,False,best electrician in town,True,0.0,0.0
100
+ humidty levels today,humidity levels today,single_typo,True,humidity levels today,True,0.0,0.0
101
+ directions to airport,directions to airport,single_typo,False,directions to airport,True,0.0,0.0
102
+ how to spel definitely,how to spell definitely,single_typo,True,how to spel definitely,False,0.043478260869565216,0.25
103
+ grocery stores near me,grocery stores near me,single_typo,False,grocery stores near me,True,0.0,0.0
104
+ buisness hours for target,business hours for target,single_typo,True,business hours for target,True,0.0,0.0
105
+ weather in chicago tommorow,weather in chicago tomorrow,single_typo,True,weather in chicago tomorrow,True,0.0,0.0
106
+ how to make omlette,how to make omelette,single_typo,True,how to make omelette,True,0.0,0.0
107
+ atm machine locations,atm machine locations,single_typo,False,atm machine locations,True,0.0,0.0
108
+ barber shop avaialble,barber shop available,single_typo,True,barber shop available,True,0.0,0.0
109
+ best restauarant in boston,best restaurant in boston,single_typo,True,best restaurant in boston,True,0.0,0.0
110
+ how to cook brocoli,how to cook broccoli,single_typo,True,how to cook broccoli,True,0.0,0.0
111
+ swimming pools near me,swimming pools near me,single_typo,False,swimming pools near me,True,0.0,0.0
112
+ seperate the documents,separate the documents,single_typo,True,separate the documents,True,0.0,0.0
113
+ temperture in fahrenheit,temperature in fahrenheit,single_typo,True,temperature in fahrenheit,True,0.0,0.0
114
+ parking garage nearby,parking garage nearby,single_typo,False,parking garage nearby,True,0.0,0.0
115
+ how to make lasagna recepie,how to make lasagna recipe,single_typo,True,how to make lasagna receive,False,0.07407407407407407,0.2
116
+ veterinary clinic hours,veterinary clinic hours,single_typo,False,veterinary clinic hours,True,0.0,0.0
117
+ what does recieve mean,what does receive mean,single_typo,True,what does receive mean,True,0.0,0.0
118
+ yoga classes availible now,yoga classes available now,single_typo,True,yoga classes available now,True,0.0,0.0
119
+ begininng spanish lessons,beginning spanish lessons,single_typo,True,beginning spanish lessons,True,0.0,0.0
120
+ plumber near me emergancy,plumber near me emergency,multi_typo,True,plumber near me emergency,True,0.0,0.0
121
+ dentist appoitment availble,dentist appointment available,multi_typo,True,dentist appointment available,True,0.0,0.0
122
+ electritian repaire servises,electrician repair services,multi_typo,True,electrician repair services,True,0.0,0.0
123
+ weather forcast this weakend,weather forecast this weekend,multi_typo,True,weather forecast this weekend,True,0.0,0.0
124
+ neerest gass station,nearest gas station,multi_typo,True,nearest gass station,False,0.05,0.3333333333333333
125
+ hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True,hotel reservation cheep rates,False,0.034482758620689655,0.25
126
+ autombile mechanic lokation,automobile mechanic location,multi_typo,True,automobile mechanic location,True,0.0,0.0
127
+ humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True,humidity forecast tomorrow,True,0.0,0.0
128
+ locksmith emergancy servise,locksmith emergency service,multi_typo,True,locksmith emergency service,True,0.0,0.0
129
+ flight ticket prices comparision,flight ticket prices comparison,multi_typo,True,flight ticket prices comparison,True,0.0,0.0
130
+ restauant reservaton opem,restaurant reservation open,multi_typo,True,restaurant reservation opem,False,0.037037037037037035,0.3333333333333333
131
+ carpentor contrator estimat,carpenter contractor estimate,multi_typo,True,carpenter contractor estimate,True,0.0,0.0
132
+ tempreture alert wheather,temperature alert weather,multi_typo,True,temperature alert whether,False,0.08,0.3333333333333333
133
+ tourist atraction guidebook,tourist attraction guidebook,multi_typo,True,tourist attraction guidebook,True,0.0,0.0
134
+ laundry servise neerby,laundry service nearby,multi_typo,True,laundry service nearby,True,0.0,0.0
135
+ vehcile registation renewel,vehicle registration renewal,multi_typo,True,vehicle registration renewed,False,0.07142857142857142,0.3333333333333333
136
+ snowstrom warning forcast,snowstorm warning forecast,multi_typo,True,snowstorm warning forecast,True,0.0,0.0
137
+ hostotel accomodation deals,hostel accommodation deals,multi_typo,True,hostel accommodation deals,True,0.0,0.0
138
+ haircut appoitment schedul,haircut appointment schedule,multi_typo,True,haircut appointment schedule,True,0.0,0.0
139
+ sunrise time locaton,sunrise time location,multi_typo,True,sunrise time location,True,0.0,0.0
140
+ moving compeny quoate,moving company quote,multi_typo,True,moving company quote,True,0.0,0.0
141
+ road conidtion trafic update,road condition traffic update,multi_typo,True,road condition traffic update,True,0.0,0.0
142
+ veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True,veterinarian emergency clinic,True,0.0,0.0
143
+ vacation packge discunt availble,vacation package discount available,multi_typo,True,vacation package discount available,True,0.0,0.0
144
+ pest contral servise lokation,pest control service location,multi_typo,True,pest control service location,True,0.0,0.0
145
+ pollin forcast alergy,pollen forecast allergy,multi_typo,True,pollen forecast allergy,True,0.0,0.0
146
+ airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True,airing accommodation recommendation,False,0.05714285714285714,0.3333333333333333
147
+ window cleening compny rates,window cleaning company rates,multi_typo,True,window cleaning company rates,True,0.0,0.0
148
+ wind gust wheather alert,wind gust weather alert,multi_typo,True,wind gust whether alert,False,0.08695652173913043,0.25
149
+ rentral car comparision price,rental car comparison price,multi_typo,True,central car comparison price,False,0.07142857142857142,0.25
150
+ goggle.com,google.com,brand_typo,True,goggle com,False,0.2,2.0
151
+ amazn.com,amazon.com,brand_typo,True,amazon com,False,0.1,2.0
152
+ spotfiy music,spotify music,brand_typo,True,spotty music,False,0.15384615384615385,0.5
153
+ instgram app,instagram app,brand_typo,True,ingram app,False,0.23076923076923078,0.5
154
+ gitub profile,github profile,brand_typo,True,tub profile,False,0.21428571428571427,0.5
155
+ redditt.com,reddit.com,brand_typo,True,reddish com,False,0.2727272727272727,2.0
156
+ twiter feed,twitter feed,brand_typo,True,twitter feed,True,0.0,0.0
157
+ linkdin jobs,linkedin jobs,brand_typo,True,linking jobs,False,0.23076923076923078,0.5
158
+ microsodt office,microsoft office,brand_typo,True,microsoft office,True,0.0,0.0
159
+ adoobe creative,adobe creative,brand_typo,True,adobe creative,True,0.0,0.0
160
+ dropbx files,dropbox files,brand_typo,True,drop files,False,0.23076923076923078,0.5
161
+ zom meeting,zoom meeting,brand_typo,True,zom meeting,False,0.08333333333333333,0.5
162
+ slck workspace,slack workspace,brand_typo,True,slck workspace,False,0.06666666666666667,0.5
163
+ paypa checkout,paypal checkout,brand_typo,True,papa checkout,False,0.13333333333333333,0.5
164
+ ebya auction,ebay auction,brand_typo,True,ebya auction,False,0.16666666666666666,0.5
165
+ wallmart groceries,walmart groceries,brand_typo,True,walmart groceries,True,0.0,0.0
166
+ targat deals,target deals,brand_typo,True,target deals,True,0.0,0.0
167
+ nytimez news,nytimes news,brand_typo,True,anytime news,False,0.16666666666666666,0.5
168
+ bbc.co.uk,bbc.com,brand_typo,True,bic co us,False,0.5555555555555556,3.0
169
+ cnn breaking,cnn breaking news,brand_typo,True,cnn breaking,False,0.29411764705882354,0.3333333333333333
170
+ youtub video,youtube video,brand_typo,True,youtube video,True,0.0,0.0
171
+ netflic series,netflix series,brand_typo,True,netflix series,True,0.0,0.0
172
+ googl drive,google drive,brand_typo,True,GOOGL,False,0.5833333333333334,1.0
173
+ amzon shopping,amazon shopping,brand_typo,True,amazon shopping,True,0.0,0.0
174
+ spotiffy playlist,spotify playlist,brand_typo,True,spiffy playlets,False,0.375,1.0
175
+ facebk messenger,facebook messenger,brand_typo,True,face messenger,False,0.2222222222222222,0.5
176
+ insta stories,instagram stories,brand_typo,True,instar stories,False,0.17647058823529413,0.5
177
+ gihub code,github code,brand_typo,True,hub code,False,0.2727272727272727,0.5
178
+ reddot forum,reddit forum,brand_typo,True,redo forum,False,0.25,0.5
179
+ BA 287,BA287,flight_order,True,BA287,True,0.0,0.0
180
+ 502 DL,DL502,flight_order,True,DL502,True,0.0,0.0
181
+ flight UA 441,UA441,flight_order,True,UA441,True,0.0,0.0
182
+ lh 156,LH156,flight_order,True,LH156,True,0.0,0.0
183
+ 273 AF,AF273,flight_order,True,AF273,True,0.0,0.0
184
+ EK 89,EK89,flight_order,True,EK89,True,0.0,0.0
185
+ 621qr,QR621,flight_order,True,QR621,True,0.0,0.0
186
+ CX 884,CX884,flight_order,True,CX884,True,0.0,0.0
187
+ 345 vs,VS345,flight_order,True,VS345,True,0.0,0.0
188
+ KL 714,KL714,flight_order,True,KL714,True,0.0,0.0
189
+ 193ib,IB193,flight_order,True,IB193,True,0.0,0.0
190
+ TK 427,TK427,flight_order,True,TK427,True,0.0,0.0
191
+ flight 556 AA,AA556,flight_order,True,AA556,True,0.0,0.0
192
+ 738 ba,BA738,flight_order,True,BA738,True,0.0,0.0
193
+ DL 212,DL212,flight_order,True,DL212,True,0.0,0.0
194
+ 84ua,UA84,flight_order,True,UA84,True,0.0,0.0
195
+ AF 609,AF609,flight_order,True,AF609,True,0.0,0.0
196
+ 445 ek,EK445,flight_order,True,EK445,True,0.0,0.0
197
+ sq 267,SQ267,flight_order,True,SQ267,True,0.0,0.0
198
+ 572 CX,CX572,flight_order,True,CX572,True,0.0,0.0
199
+ flight vs314,VS314,flight_order,True,flight vs314,False,0.5833333333333334,1.0
200
+ 981 KL,KL981,flight_order,True,KL981,True,0.0,0.0
201
+ IB 456,IB456,flight_order,True,IB456,True,0.0,0.0
202
+ tk 103,TK103,flight_order,True,TK103,True,0.0,0.0
203
+ 890 SQ,SQ890,flight_order,True,SQ890,True,0.0,0.0
204
+ 13 air macbook,macbook air 13,product_order,True,macbook 13 air,False,0.42857142857142855,0.6666666666666666
205
+ pro iphone 15,iphone 15 pro,product_order,True,iphone pro 15,False,0.46153846153846156,0.6666666666666666
206
+ 8 pixel google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
207
+ ultra 24 s23 samsung,samsung s23 ultra,product_order,True,samsung ultra 24 s23,False,0.55,1.0
208
+ fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True,samsung fold 5 galaxy,False,0.47619047619047616,0.5
209
+ max 14 pro iphone,iphone 14 pro max,product_order,True,iphone max 14 pro,False,0.47058823529411764,0.5
210
+ 16 macbook pro,macbook pro 16,product_order,True,macbook 16 pro,False,0.42857142857142855,0.6666666666666666
211
+ pixel pro 7 google,google pixel 7 pro,product_order,True,pixel pro 7 google,False,0.7777777777777778,0.75
212
+ tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True,samsung tab s9 galaxy,False,0.47619047619047616,0.5
213
+ 12 mini iphone,iphone 12 mini,product_order,True,iphone 12 mini,True,0.0,0.0
214
+ z fold 4 samsung,samsung galaxy z fold 4,product_order,True,samsung z fold 4,False,0.30434782608695654,0.2
215
+ watch series 9 apple,apple watch series 9,product_order,True,watch series 9 apple,False,0.6,0.5
216
+ xl pixel 8 google,google pixel 8 xl,product_order,True,pixel xl 8 google,False,0.7647058823529411,0.75
217
+ s24 ultra samsung,samsung s24 ultra,product_order,True,samsung s24 ultra,True,0.0,0.0
218
+ 15 macbook air,macbook air 15,product_order,True,macbook 15 air,False,0.42857142857142855,0.6666666666666666
219
+ iphone pro 13,iphone 13 pro,product_order,True,iphone pro 13,False,0.46153846153846156,0.6666666666666666
220
+ flip 5 z samsung,samsung galaxy z flip 5,product_order,True,samsung flip 5 z,False,0.4782608695652174,0.6
221
+ 7 series watch apple,apple watch series 7,product_order,True,7 series watch apple,False,0.7,1.0
222
+ a15 oneplus,oneplus a15,product_order,True,a15 onerous,False,0.9090909090909091,1.0
223
+ pad air 11 ipad,ipad air 11,product_order,True,ipad pad air 11,False,0.26666666666666666,0.3333333333333333
224
+ ultra 15 iphone pro,iphone 15 pro max,product_order,True,iphone ultra 15 pro,False,0.5263157894736842,0.5
225
+ note 24 galaxy samsung,samsung galaxy note 24,product_order,True,samsung note 24 galaxy,False,0.6363636363636364,0.5
226
+ 11 pro max iphone,iphone 11 pro max,product_order,True,iphone 11 pro max,True,0.0,0.0
227
+ studio display apple,apple studio display,product_order,True,studio display apple,False,0.6,0.6666666666666666
228
+ x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True,x1 carbon lenore thinkpad,False,0.88,1.0
229
+ AAPL stock price,AAPL,stock_canon,True,AAPL,True,0.0,0.0
230
+ tesla share price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
231
+ MSFT earnings,MSFT,stock_canon,True,MSFT,True,0.0,0.0
232
+ google GOOGL stock,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
233
+ AMZN share,AMZN,stock_canon,True,AMZN,True,0.0,0.0
234
+ amazon price AMZN,AMZN,stock_canon,True,AMZN,True,0.0,0.0
235
+ META stock price,META,stock_canon,True,META,True,0.0,0.0
236
+ nvidia NVDA,NVDA,stock_canon,True,NVDA,True,0.0,0.0
237
+ NFLX share price,NFLX,stock_canon,True,NFLX,True,0.0,0.0
238
+ netflix stock NFLX,NFLX,stock_canon,True,NFLX,True,0.0,0.0
239
+ PYPL price,PYPL,stock_canon,True,PYPL,True,0.0,0.0
240
+ paypal PYPL stock,PYPL,stock_canon,True,PYPL,True,0.0,0.0
241
+ SNAP stock,SNAP,stock_canon,True,SNAP,True,0.0,0.0
242
+ snapchat SNAP,SNAP,stock_canon,True,SNAP,True,0.0,0.0
243
+ AMD share price,AMD,stock_canon,True,AMD,True,0.0,0.0
244
+ amd processor stock,AMD,stock_canon,True,AMD,True,0.0,0.0
245
+ INTC earnings,INTC,stock_canon,True,INTC,True,0.0,0.0
246
+ intel INTC stock,INTC,stock_canon,True,INTC,True,0.0,0.0
247
+ QCOM price,QCOM,stock_canon,True,QCOM price,False,0.6,1.0
248
+ qualcomm QCOM,QCOM,stock_canon,True,qualcomm QCOM,False,0.6923076923076923,1.0
249
+ UBER stock price,UBER,stock_canon,True,UBER,True,0.0,0.0
250
+ lyft LYFT share,LYFT,stock_canon,True,LYFT,True,0.0,0.0
251
+ airbnb ABNB stock,ABNB,stock_canon,True,ABNB,True,0.0,0.0
252
+ iphone15pro,iphone 15 pro,spacing,True,iphone pro,False,0.23076923076923078,0.3333333333333333
253
+ samsungz9,samsung z9,spacing,True,samsung,False,0.3,0.5
254
+ ipadair,ipad air,spacing,True,ipad air,True,0.0,0.0
255
+ losangeles,los angeles,spacing,True,los angeles,True,0.0,0.0
256
+ sanfrancisco,san francisco,spacing,True,san francisco,True,0.0,0.0
257
+ nearbyshops,nearby shops,spacing,True,nearby shops,True,0.0,0.0
258
+ dellxps13,dell xps 13,spacing,True,dell see,False,0.45454545454545453,0.6666666666666666
259
+ surfacelaptopp5,surface laptop p5,spacing,True,surface laptop,False,0.17647058823529413,0.3333333333333333
260
+ newyorkpizza,new york pizza,spacing,True,new pizza,False,0.35714285714285715,0.3333333333333333
261
+ holmescompany,holmes company,spacing,True,holmes company,True,0.0,0.0
262
+ findme,find me,spacing,True,find me,True,0.0,0.0
263
+ bostonma,boston ma,spacing,True,boston a,False,0.1111111111111111,0.5
264
+ sanjose,san jose,spacing,True,san jose,True,0.0,0.0
265
+ pixelwatch2,pixel watch 2,spacing,True,pixel watch,False,0.15384615384615385,0.3333333333333333
266
+ openpizza,open pizza,spacing,True,open pizza,True,0.0,0.0
267
+ northcarolina,north carolina,spacing,True,north carolina,True,0.0,0.0
268
+ showevenear,show venues near,spacing,True,showed near,False,0.375,0.6666666666666666
269
+ galax30series,galax 30 series,spacing,True,galaxy series,False,0.2,0.6666666666666666
270
+ laptop,laptop,spacing,False,laptop,True,0.0,0.0
271
+ smartphone,smartphone,spacing,False,smartphone,True,0.0,0.0
272
+ keyboard,keyboard,spacing,False,keyboard,True,0.0,0.0
273
+ monitor,monitor,spacing,False,monitor,True,0.0,0.0
274
+ NYC,NYC,no_change,False,NYC,True,0.0,0.0
275
+ LA,LA,no_change,False,LA,True,0.0,0.0
276
+ UK,UK,no_change,False,UK,True,0.0,0.0
277
+ vue,vue,no_change,False,vue,True,0.0,0.0
278
+ aws,aws,no_change,False,aws,True,0.0,0.0
279
+ sql,sql,no_change,False,sql,True,0.0,0.0
280
+ git,git,no_change,False,git,True,0.0,0.0
281
+ c,c,no_change,False,c,True,0.0,0.0
282
+ x,x,no_change,False,x,True,0.0,0.0
283
+ r,r,no_change,False,r,True,0.0,0.0
284
+ z,z,no_change,False,z,True,0.0,0.0
285
+ kafka,kafka,no_change,False,kafka,True,0.0,0.0
286
+ nginx,nginx,no_change,False,nine,False,0.4,1.0
287
+ vim,vim,no_change,False,vim,True,0.0,0.0
288
+ pdf,pdf,no_change,False,pdf,True,0.0,0.0
289
+ xml,xml,no_change,False,xml,True,0.0,0.0
290
+ svg,svg,no_change,False,svg,True,0.0,0.0
291
+ gcp,gcp,no_change,False,gcp,True,0.0,0.0
292
+ cli,cli,no_change,False,cli,True,0.0,0.0
293
+ api,api,no_change,False,api,True,0.0,0.0
294
+ jwt,jwt,no_change,False,jwt,True,0.0,0.0
295
+ mvp,mvp,no_change,False,mvp,True,0.0,0.0
296
+ gdpr,gdpr,no_change,False,gdpr,True,0.0,0.0
297
+ crm,crm,no_change,False,crm,True,0.0,0.0
298
+ ux,ux,no_change,False,ux,True,0.0,0.0
299
+ pwa,pwa,no_change,False,pwa,True,0.0,0.0
300
+ orm,orm,no_change,False,orm,True,0.0,0.0