Spaces:
Running
Running
query normalization
Browse files- app.py +305 -0
- benchmark.py +820 -0
- dataset.csv +300 -0
- requirements.txt +5 -2
- results.csv +300 -0
app.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import streamlit as st
|
| 6 |
+
|
| 7 |
+
st.set_page_config(
|
| 8 |
+
page_title="Firefox Query Normalizer",
|
| 9 |
+
page_icon="🔍",
|
| 10 |
+
layout="wide",
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
HERE = Path(__file__).parent
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ─── Normalizer (loaded once, cached across reruns) ───────────────────────────
|
| 17 |
+
|
| 18 |
+
@st.cache_resource(show_spinner="Loading normalizer…")
def load_normalizer():
    """Build the CombinedV2 normalizer used by the app.

    The ``benchmark`` import is kept inside the function so it only runs when
    the normalizer is first requested; ``st.cache_resource`` then hands the
    same instance back on later calls.
    """
    from benchmark import CombinedV2Normalizer  # noqa: PLC0415

    normalizer = CombinedV2Normalizer()
    return normalizer
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ─── Data ─────────────────────────────────────────────────────────────────────
|
| 25 |
+
|
| 26 |
+
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the stored benchmark results and attach a display outcome per row.

    Reads ``results.csv`` from the directory of this file, coerces the
    ``should_change`` and ``em`` columns to ``bool`` and adds an ``outcome``
    column computed by ``_classify_outcome``.

    Returns:
        The results frame with the additional ``outcome`` column.
    """
    df = pd.read_csv(HERE / "results.csv")
    df["should_change"] = df["should_change"].astype(bool)
    df["em"] = df["em"].astype(bool)
    # Classify record by record instead of ``df.apply(..., axis=1)``: for a
    # zero-row frame a row-wise apply can hand back a DataFrame rather than a
    # Series, which cannot be assigned to a single column. A list of labels
    # has the right length for any number of rows, including none.
    df["outcome"] = [_classify_outcome(record) for record in df.to_dict("records")]
    return df
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _classify_outcome(row) -> str:
|
| 36 |
+
if row["should_change"]:
|
| 37 |
+
if row["em"]:
|
| 38 |
+
return "✅ Fixed correctly"
|
| 39 |
+
elif str(row["pred"]).strip().lower() == str(row["noisy"]).strip().lower():
|
| 40 |
+
return "❌ Not fixed"
|
| 41 |
+
else:
|
| 42 |
+
return "⚠️ Fixed incorrectly"
|
| 43 |
+
else:
|
| 44 |
+
return "✅ Left unchanged" if row["em"] else "❌ Over-corrected"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
CATEGORY_INFO: dict[str, tuple[str, str]] = {
|
| 48 |
+
"single_typo": ("✏️ Single Typo", "One misspelled word (e.g. 'wheather' → 'weather')"),
|
| 49 |
+
"multi_typo": ("✏️ Multi Typo", "Two or more typos in the same query"),
|
| 50 |
+
"brand_typo": ("🏷️ Brand Typo", "Brand name misspelled (e.g. 'bestbuyt' → 'best buy')"),
|
| 51 |
+
"flight_order": ("✈️ Flight Order", "Flight number tokens reordered (e.g. '163 SQ' → 'SQ163')"),
|
| 52 |
+
"product_order": ("📱 Product Order", "Product tokens reordered (e.g. '15 iphone' → 'iphone 15')"),
|
| 53 |
+
"stock_canon": ("📈 Stock Ticker", "Stock query → ticker only (e.g. 'AAPL stock' → 'AAPL')"),
|
| 54 |
+
"spacing": ("⎵ Spacing", "Missing spaces fixed (e.g. 'nearme' → 'near me')"),
|
| 55 |
+
"no_change": ("🔒 No Change", "Should not be modified — tests over-correction resistance"),
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
OUTCOME_ORDER = [
|
| 59 |
+
"✅ Fixed correctly",
|
| 60 |
+
"✅ Left unchanged",
|
| 61 |
+
"❌ Not fixed",
|
| 62 |
+
"⚠️ Fixed incorrectly",
|
| 63 |
+
"❌ Over-corrected",
|
| 64 |
+
]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ─── Header ───────────────────────────────────────────────────────────────────
|
| 68 |
+
|
| 69 |
+
st.title("🔍 Query Normalizer")
|
| 70 |
+
st.caption("**CombinedV2** pipeline · Preprocessing stage for Merino intent classification")
|
| 71 |
+
|
| 72 |
+
with st.expander("ℹ️ What is this and why does it matter?", expanded=False):
|
| 73 |
+
st.markdown("""
|
| 74 |
+
Intent detection tries to classify user queries by intents —
|
| 75 |
+
navigational, local, commercial, etc. — to surface the right suggestions.
|
| 76 |
+
Real queries are noisy: users make typos, omit spaces, or enter tokens in the
|
| 77 |
+
wrong order.
|
| 78 |
+
|
| 79 |
+
**CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms**
|
| 80 |
+
per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:
|
| 81 |
+
|
| 82 |
+
| Step | What it handles | Example |
|
| 83 |
+
|------|----------------|---------|
|
| 84 |
+
| **1 · Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` → `SQ163` |
|
| 85 |
+
| **2 · RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` → `best buy` |
|
| 86 |
+
| **3 · SymSpell** | Concatenated word splitting | `nearme` → `near me` |
|
| 87 |
+
| **4 · GuardedPySpell** | Spell correction (skips ≤4-char tokens & ALL_CAPS) | `wheather nyc` → `weather nyc` |
|
| 88 |
+
|
| 89 |
+
**Benchmark results across 299 queries in 8 categories:**
|
| 90 |
+
|
| 91 |
+
| Metric | Score |
|
| 92 |
+
|--------|-------|
|
| 93 |
+
| Exact match on queries that need fixing | **73.2%** |
|
| 94 |
+
| Precision on queries that should NOT change | **98.5%** |
|
| 95 |
+
| Median latency (p50) | **0.03 ms** |
|
| 96 |
+
""")
|
| 97 |
+
|
| 98 |
+
st.divider()
|
| 99 |
+
|
| 100 |
+
# ─── Tabs ─────────────────────────────────────────────────────────────────────
|
| 101 |
+
|
| 102 |
+
tab_try, tab_browse, tab_perf = st.tabs(["🔤 Try It", "📋 Browse Examples", "📊 Performance"])
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 106 |
+
# TAB 1 — Try It
|
| 107 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 108 |
+
|
| 109 |
+
with tab_try:
    norm = load_normalizer()
    df = load_data()

    # ── Free-form input (prominent) ───────────────────────────────────
    st.subheader("Type a query to normalize")
    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickers…")

    user_query = st.text_input(
        "Query input",
        placeholder="e.g. wheather nyc · 163 SQ · bestbuyt · nearme · 15 iphone · AAPL stock",
        label_visibility="collapsed",
        key="user_query",
    )

    # Strip once; every use below works on the trimmed query.
    query = user_query.strip()
    if query:
        t0 = time.perf_counter()
        result = norm.normalize(query)
        elapsed_ms = (time.perf_counter() - t0) * 1000

        res_col, meta_col = st.columns([3, 1])
        with res_col:
            if result.lower() == query.lower():
                st.success(f"**`{query}`** → no change needed → **`{result}`**")
            else:
                st.info(f"**`{query}`** → **`{result}`**")

        with meta_col:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")

        # Check if it's in the benchmark dataset
        match = df[df["noisy"].str.lower() == query.lower()]
        if len(match):
            row = match.iloc[0]
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            # Compare case-insensitively: the benchmark's exact-match metric is
            # documented as case-insensitive, so a case-only difference must
            # not be reported here as a mismatch.
            if result.strip().lower() == str(row["canonical"]).strip().lower():
                note = f"✅ Matches expected output `{row['canonical']}`"
            else:
                note = f"Expected `{row['canonical']}` · benchmark outcome: **{row['outcome']}**"
            st.caption(f"_Found in benchmark · {cat_label} · {note}_")

    st.divider()

    # ── Example picker ────────────────────────────────────────────────
    st.subheader("Or pick an example from the benchmark")

    pick_col1, pick_col2 = st.columns(2)
    with pick_col1:
        cat_pick = st.selectbox(
            "Category",
            ["All"] + list(CATEGORY_INFO.keys()),
            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
            key="cat_pick",
        )
    with pick_col2:
        show_errors_only = st.checkbox("Errors / failures only", value=False)

    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
    if show_errors_only:
        sub = sub[~sub["em"]]

    if len(sub) == 0:
        st.info("No examples match these filters.")
    else:
        example_labels = [
            f"{row.noisy} [{CATEGORY_INFO.get(row.category, (row.category,''))[0]}]"
            for row in sub.itertuples()
        ]
        picked_label = st.selectbox("Example", example_labels, key="example_pick")
        # The category suffix is appended last, so split from the right; a
        # query that itself contains " [" is then recovered intact.
        picked_noisy = picked_label.rsplit(" [", 1)[0]
        row = sub[sub["noisy"] == picked_noisy].iloc[0]

        ex_left, ex_right = st.columns([3, 1])
        with ex_left:
            t0 = time.perf_counter()
            ex_result = norm.normalize(picked_noisy)
            elapsed_ms = (time.perf_counter() - t0) * 1000

            st.markdown(f"**Input:** `{picked_noisy}`")
            st.markdown(f"**Expected:** `{row['canonical']}`")

            # Same case-insensitive comparison as the benchmark's exact match.
            if ex_result.strip().lower() == str(row["canonical"]).strip().lower():
                st.success(f"**Got:** `{ex_result}` ✅")
            elif ex_result.lower() == picked_noisy.lower():
                st.error(f"**Got:** `{ex_result}` — normalizer didn't fix it")
            else:
                st.warning(f"**Got:** `{ex_result}` — expected `{row['canonical']}`")

        with ex_right:
            st.metric("Latency", f"{elapsed_ms:.2f} ms")
            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
            st.caption(cat_label)
            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 205 |
+
# TAB 2 — Browse Examples
|
| 206 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 207 |
+
|
| 208 |
+
with tab_browse:
    df = load_data()

    # Two side-by-side filters; both start with everything selected.
    category_keys = list(CATEGORY_INFO.keys())
    f1, f2 = st.columns(2)
    with f1:
        cats = st.multiselect(
            "Categories",
            options=category_keys,
            default=list(category_keys),
            format_func=lambda k: CATEGORY_INFO[k][0],
        )
    with f2:
        outcomes = st.multiselect(
            "Outcomes",
            options=OUTCOME_ORDER,
            default=OUTCOME_ORDER,
        )

    # Keep only rows passing both filters.
    category_mask = df["category"].isin(cats)
    outcome_mask = df["outcome"].isin(outcomes)
    filtered = df[category_mask & outcome_mask]
    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")

    # Presentation copy: friendly headers and category labels instead of keys.
    display = filtered[["noisy", "pred", "canonical", "category", "outcome"]].copy()
    display.columns = ["Input (noisy)", "Predicted", "Expected", "Category", "Outcome"]
    display["Category"] = display["Category"].map(
        lambda k: CATEGORY_INFO.get(k, (k, ""))[0]
    )

    column_widths = {
        "Input (noisy)": "medium",
        "Predicted": "medium",
        "Expected": "medium",
        "Category": "medium",
        "Outcome": "small",
    }
    st.dataframe(
        display,
        use_container_width=True,
        hide_index=True,
        height=540,
        column_config={
            header: st.column_config.TextColumn(width=width)
            for header, width in column_widths.items()
        },
    )
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 251 |
+
# TAB 3 — Performance
|
| 252 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 253 |
+
|
| 254 |
+
with tab_perf:
    df = load_data()

    def _em_rate(frame: pd.DataFrame, spec: str) -> str:
        """Format the mean of ``em`` with *spec*, or "—" for an empty subset.

        The mean of an empty column is NaN, which would otherwise be rendered
        as ``nan%``.
        """
        return format(frame["em"].mean(), spec) if len(frame) else "—"

    needs_change = df[df["should_change"]]
    no_change = df[~df["should_change"]]

    # Headline metrics over the whole result set.
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total examples", f"{len(df)}")
    c2.metric("Overall EM", _em_rate(df, ".1%"))
    c3.metric("Fix accuracy", _em_rate(needs_change, ".1%"),
              help="Exact match on queries that SHOULD change")
    c4.metric("No-change precision", _em_rate(no_change, ".1%"),
              help="Correctly left unchanged queries that should NOT change")

    st.markdown("---")
    st.subheader("Per-category breakdown")

    rows = []
    for cat, (label, desc) in CATEGORY_INFO.items():
        sub = df[df["category"] == cat]
        if len(sub) == 0:
            continue
        needs = sub[sub["should_change"]]
        ok = sub[~sub["should_change"]]
        rows.append({
            "Category": label,
            "n": len(sub),
            "EM %": _em_rate(sub, ".0%"),
            "Fix accuracy": _em_rate(needs, ".0%"),
            "No-change prec.": _em_rate(ok, ".0%"),
            "Errors": int((~sub["em"]).sum()),
            "What it tests": desc,
        })

    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Failure cases by category")
    st.caption("All queries where the normalizer produced a wrong output.")

    failures = df[~df["em"]]
    if len(failures) == 0:
        st.success("No failures!")
    else:
        # One collapsible table per category that has at least one failure.
        for cat, (label, _) in CATEGORY_INFO.items():
            sub = failures[failures["category"] == cat]
            if len(sub) == 0:
                continue
            with st.expander(f"{label} — {len(sub)} failure{'s' if len(sub) != 1 else ''}"):
                show = sub[["noisy", "pred", "canonical", "outcome"]].copy()
                show.columns = ["Input", "Predicted", "Expected", "Outcome"]
                st.dataframe(show, use_container_width=True, hide_index=True)
|
benchmark.py
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Query Normalization Benchmark
|
| 3 |
+
==============================
|
| 4 |
+
Benchmarks multiple normalization approaches on the generated dataset.
|
| 5 |
+
|
| 6 |
+
Normalizers:
|
| 7 |
+
1. Identity - baseline, no change
|
| 8 |
+
2. PySpellChecker - token-by-token spell correction (current approach)
|
| 9 |
+
3. SymSpell - faster, supports compound word correction
|
| 10 |
+
4. Rules - regex + entity canonicalization (flight IDs, stock tickers, product spacing)
|
| 11 |
+
5. RapidFuzz - fuzzy brand name matching
|
| 12 |
+
6. Combined - Rules → SymSpell → RapidFuzz pipeline
|
| 13 |
+
--- ML ---
|
| 14 |
+
7. ContextualSpellCheck - spaCy pipeline with BERT contextual embeddings
|
| 15 |
+
8. T5SpellCorrector - HuggingFace T5 fine-tuned for spelling correction
|
| 16 |
+
9. CombinedML - Rules → T5 pipeline (entity rules first, T5 for the rest)
|
| 17 |
+
|
| 18 |
+
Metrics (per normalizer, per category):
|
| 19 |
+
exact_match - % where output == canonical (case-insensitive)
|
| 20 |
+
cer - character error rate: edit_dist / max(len_pred, len_gold)
|
| 21 |
+
wer - word error rate: token-level edit distance / n_gold_tokens
|
| 22 |
+
no_change_precision - on no_change rows: % correctly left unchanged
|
| 23 |
+
over_correction - on no_change rows: % wrongly changed
|
| 24 |
+
latency_mean_ms - mean per-query latency
|
| 25 |
+
latency_p50_ms - p50 latency
|
| 26 |
+
latency_p95_ms - p95 latency
|
| 27 |
+
latency_p99_ms - p99 latency
|
| 28 |
+
|
| 29 |
+
Usage:
|
| 30 |
+
pip install -r requirements.txt
|
| 31 |
+
python3 benchmark.py [--dataset dataset.csv]
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
import re
|
| 35 |
+
import sys
|
| 36 |
+
import time
|
| 37 |
+
import argparse
|
| 38 |
+
import warnings
|
| 39 |
+
import numpy as np
|
| 40 |
+
import pandas as pd
|
| 41 |
+
from pathlib import Path
|
| 42 |
+
from abc import ABC, abstractmethod
|
| 43 |
+
from typing import Optional
|
| 44 |
+
|
| 45 |
+
warnings.filterwarnings("ignore")
|
| 46 |
+
|
| 47 |
+
# ── Optional imports ───────────────────────────────────────────────────────────
|
| 48 |
+
|
| 49 |
+
try:
    from Levenshtein import distance as _lev

    def edit_distance(a: str, b: str) -> int:
        """Return the Levenshtein distance between *a* and *b*."""
        return _lev(a, b)

except ImportError:

    def edit_distance(a: str, b: str) -> int:
        """Return the Levenshtein distance between *a* and *b*.

        Pure-python fallback used when the ``Levenshtein`` package is not
        installed. Insertions, deletions and substitutions each cost 1.
        """
        if a == b:
            return 0
        # The distance is symmetric, so put the shorter string on the column
        # axis to keep the rolling row as small as possible.
        if len(a) < len(b):
            a, b = b, a
        if not b:
            return len(a)

        prev = list(range(len(b) + 1))
        for i, ch_a in enumerate(a, 1):
            cur = [i]
            for j, ch_b in enumerate(b, 1):
                if ch_a == ch_b:
                    cur.append(prev[j - 1])
                else:
                    # deletion, insertion, substitution
                    cur.append(1 + min(prev[j], cur[j - 1], prev[j - 1]))
            prev = cur
        return prev[-1]
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
from spellchecker import SpellChecker as _SC
|
| 66 |
+
HAS_PYSPELL = True
|
| 67 |
+
except ImportError:
|
| 68 |
+
HAS_PYSPELL = False
|
| 69 |
+
print("Warning: pyspellchecker not installed — skipping PySpell normalizer")
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
from symspellpy import SymSpell as _SS, Verbosity as _V
|
| 73 |
+
import pkg_resources
|
| 74 |
+
HAS_SYMSPELL = True
|
| 75 |
+
except ImportError:
|
| 76 |
+
HAS_SYMSPELL = False
|
| 77 |
+
print("Warning: symspellpy not installed — skipping SymSpell normalizer")
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
from rapidfuzz import process as _rf_process, fuzz as _rf_fuzz
|
| 81 |
+
HAS_RAPIDFUZZ = True
|
| 82 |
+
except ImportError:
|
| 83 |
+
HAS_RAPIDFUZZ = False
|
| 84 |
+
print("Warning: rapidfuzz not installed — skipping RapidFuzz normalizer")
|
| 85 |
+
|
| 86 |
+
try:
|
| 87 |
+
import spacy as _spacy
|
| 88 |
+
import contextualSpellCheck as _csc
|
| 89 |
+
_csc_nlp = _spacy.load("en_core_web_sm")
|
| 90 |
+
_csc.add_to_pipe(_csc_nlp)
|
| 91 |
+
HAS_CONTEXTUAL = True
|
| 92 |
+
except Exception:
|
| 93 |
+
HAS_CONTEXTUAL = False
|
| 94 |
+
print("Warning: contextualSpellCheck/spacy not available — skipping ContextualSpellCheck normalizer")
|
| 95 |
+
print(" Install: pip install contextualSpellCheck && python -m spacy download en_core_web_sm")
|
| 96 |
+
|
| 97 |
+
try:
|
| 98 |
+
from transformers import pipeline as _hf_pipeline
|
| 99 |
+
HAS_TRANSFORMERS = True
|
| 100 |
+
except ImportError:
|
| 101 |
+
HAS_TRANSFORMERS = False
|
| 102 |
+
print("Warning: transformers not installed — skipping T5 normalizer")
|
| 103 |
+
print(" Install: pip install transformers torch")
|
| 104 |
+
|
| 105 |
+
# ── Brand list for fuzzy matching ──────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
BRANDS = [
|
| 108 |
+
"amazon", "google", "facebook", "twitter", "instagram", "youtube",
|
| 109 |
+
"linkedin", "reddit", "netflix", "spotify", "microsoft", "adobe",
|
| 110 |
+
"dropbox", "github", "slack", "zoom", "paypal", "ebay", "walmart",
|
| 111 |
+
"target", "best buy", "new york times", "bbc", "cnn", "espn",
|
| 112 |
+
"gmail", "outlook", "yahoo", "apple", "samsung", "dell", "hp",
|
| 113 |
+
"lenovo", "asus", "acer", "toshiba", "sony", "lg", "panasonic",
|
| 114 |
+
"booking.com", "expedia", "airbnb", "tripadvisor", "yelp",
|
| 115 |
+
"doordash", "ubereats", "grubhub", "lyft", "uber",
|
| 116 |
+
"twitch", "discord", "telegram", "whatsapp", "snapchat", "tiktok",
|
| 117 |
+
]
|
| 118 |
+
|
| 119 |
+
# ── Entity lists for rules normalizer ──────────────────────────────────────────
|
| 120 |
+
|
| 121 |
+
# Common IATA airline designators (every entry below is a 2-character code; some contain a digit)
|
| 122 |
+
IATA_CODES = {
|
| 123 |
+
"AA", "BA", "DL", "UA", "LH", "AF", "EK", "QR", "SQ", "CX",
|
| 124 |
+
"VS", "KL", "IB", "TK", "AC", "QF", "NH", "JL", "MH", "TG",
|
| 125 |
+
"AI", "SA", "ET", "KE", "OZ", "CI", "BR", "LA", "AV", "AM",
|
| 126 |
+
"WN", "B6", "AS", "F9", "NK", "G4", "VX", "HA",
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
# Common stock tickers → company name aliases
|
| 130 |
+
STOCK_ALIASES: dict[str, list[str]] = {
|
| 131 |
+
"AAPL": ["apple", "aapl"],
|
| 132 |
+
"TSLA": ["tesla", "tsla"],
|
| 133 |
+
"MSFT": ["microsoft", "msft"],
|
| 134 |
+
"GOOGL": ["google", "alphabet", "googl"],
|
| 135 |
+
"AMZN": ["amazon", "amzn"],
|
| 136 |
+
"META": ["meta", "facebook", "fb"],
|
| 137 |
+
"NVDA": ["nvidia", "nvda"],
|
| 138 |
+
"NFLX": ["netflix", "nflx"],
|
| 139 |
+
"PYPL": ["paypal", "pypl"],
|
| 140 |
+
"SNAP": ["snapchat", "snap"],
|
| 141 |
+
"AMD": ["amd"],
|
| 142 |
+
"INTC": ["intel", "intc"],
|
| 143 |
+
"UBER": ["uber"],
|
| 144 |
+
"LYFT": ["lyft"],
|
| 145 |
+
"ABNB": ["airbnb", "abnb"],
|
| 146 |
+
"COIN": ["coinbase", "coin"],
|
| 147 |
+
"HOOD": ["robinhood", "hood"],
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
# Reverse map: alias → ticker
|
| 151 |
+
_ALIAS_TO_TICKER: dict[str, str] = {}
|
| 152 |
+
for ticker, aliases in STOCK_ALIASES.items():
|
| 153 |
+
for alias in aliases:
|
| 154 |
+
_ALIAS_TO_TICKER[alias.lower()] = ticker
|
| 155 |
+
|
| 156 |
+
# Product model patterns: brand → canonical prefix
|
| 157 |
+
PRODUCT_BRANDS = ["iphone", "samsung", "macbook", "ipad", "pixel", "surface"]
|
| 158 |
+
|
| 159 |
+
# ── Base normalizer ────────────────────────────────────────────────────────────
|
| 160 |
+
|
| 161 |
+
class Normalizer(ABC):
    """Abstract base class shared by every query normalizer.

    Subclasses set ``name`` and implement ``normalize``; ``warmup`` and
    ``normalize_batch`` have default implementations.
    """

    # Display label for the normalizer; each concrete subclass assigns it.
    name: str

    def warmup(self) -> None:
        """Called once before benchmarking to initialize any lazy state."""

    @abstractmethod
    def normalize(self, query: str) -> str:
        """Return the normalized form of *query*."""

    def normalize_batch(self, queries: list[str]) -> list[str]:
        """Normalize each query in order by calling ``normalize`` on it."""
        return [self.normalize(q) for q in queries]
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ── 1. Identity (baseline) ────────────────────────────────────────────────────
|
| 177 |
+
|
| 178 |
+
class IdentityNormalizer(Normalizer):
    """Baseline normalizer: hands every query back untouched."""

    name = "Identity (baseline)"

    def normalize(self, query: str) -> str:
        """Return *query* exactly as received."""
        unchanged = query
        return unchanged
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ── 2. PySpellChecker ────────────────────────────────────────────────────────
|
| 186 |
+
|
| 187 |
+
class PySpellNormalizer(Normalizer):
    """Token-by-token spell correction backed by pyspellchecker."""

    name = "PySpellChecker"

    def __init__(self):
        """Create the spell checker.

        Raises:
            RuntimeError: If pyspellchecker could not be imported.
        """
        if HAS_PYSPELL:
            self._sc = _SC()
        else:
            raise RuntimeError("pyspellchecker not installed")

    def normalize(self, query: str) -> str:
        """Lower-case *query* and correct it one whitespace-separated word at a time."""
        corrected = []
        for word in query.lower().split():
            suggestion = self._sc.correction(word)
            # Keep the original word when the checker offers nothing usable.
            corrected.append(suggestion if suggestion else word)
        return " ".join(corrected)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ── 3. SymSpell ───────────────────────────────────────────────────────────────
|
| 201 |
+
|
| 202 |
+
_ORCAS_VOCAB = Path(__file__).parent / "orcas_vocab.txt"
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
class SymSpellNormalizer(Normalizer):
    """Compound spell correction backed by symspellpy.

    Loads a frequency dictionary shipped inside the ``symspellpy`` package
    and, when ``orcas_vocab.txt`` exists next to this file, that vocabulary
    as well.
    """

    name = "SymSpell"

    # Candidate dictionary filenames, tried in order (the name changed
    # across symspellpy versions).
    _DICT_CANDIDATES = ("frequency_dictionary_en_82_765.txt", "en-80k.txt")

    def __init__(self, max_edit_distance: int = 2):
        """Create the SymSpell index and load its dictionaries.

        Args:
            max_edit_distance: Maximum edit distance, used both for the
                dictionary index and for lookups.

        Raises:
            RuntimeError: If symspellpy could not be imported.
        """
        if not HAS_SYMSPELL:
            raise RuntimeError("symspellpy not installed")
        self._sym = _SS(max_dictionary_edit_distance=max_edit_distance)
        self._max_ed = max_edit_distance

        if not self._load_bundled_dictionary():
            # Stay best-effort (no exception), but do not fail silently: with
            # no dictionary loaded there is nothing to correct against.
            print("Warning: could not load a symspellpy frequency dictionary — "
                  "SymSpell corrections will be ineffective")

        if _ORCAS_VOCAB.exists():
            self._sym.load_dictionary(str(_ORCAS_VOCAB), term_index=0, count_index=1)
            self.name = "SymSpell+ORCAS"

    def _load_bundled_dictionary(self) -> bool:
        """Load the first usable bundled dictionary; return True on success."""
        # Try importlib.resources first (works in newer Python/packaging
        # setups); any failure for one candidate just moves on to the next.
        import importlib.resources as _ir

        for fname in self._DICT_CANDIDATES:
            try:
                ref = _ir.files("symspellpy").joinpath(fname)
                with _ir.as_file(ref) as path:
                    if self._sym.load_dictionary(str(path), term_index=0, count_index=1):
                        return True
            except Exception:
                continue

        # Fall back to pkg_resources for older environments.
        for fname in self._DICT_CANDIDATES:
            try:
                path = pkg_resources.resource_filename("symspellpy", fname)
            except Exception:
                continue
            if self._sym.load_dictionary(path, term_index=0, count_index=1):
                return True
        return False

    def normalize(self, query: str) -> str:
        """Return the top ``lookup_compound`` suggestion for the lower-cased query.

        Falls back to the lower-cased query when no suggestion is returned.
        """
        # Use lookup_compound for multi-token correction
        suggestions = self._sym.lookup_compound(
            query.lower(), max_edit_distance=self._max_ed
        )
        if suggestions:
            return suggestions[0].term
        return query.lower()
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
# ── 4. Rules (entity + regex) ────────────────────────────────────────────────
|
| 252 |
+
|
| 253 |
+
class RulesNormalizer(Normalizer):
    """Deterministic normalizer built from entity lookups and regexes.

    ``normalize`` applies, in order: stock-ticker canonicalization (which
    short-circuits the rest when it fires), flight-ID compaction, product
    spacing, product word order, and whitespace cleanup.
    """
    name = "Rules (entity + regex)"

    # Flight: digits + IATA or IATA + digits → IATA + digits (no space)
    _FLIGHT_LOOSE = re.compile(
        r'\b(?:flight\s+)?(\d{2,4})\s*([A-Z]{2,3})\b'  # 163 SQ
        r'|'
        r'\b(?:flight\s+)?([A-Z]{2,3})\s+(\d{2,4})\b',  # SQ 163 (space)
        re.IGNORECASE
    )

    # Product spacing: brand directly followed by digits/variant ("iphone15")
    _PRODUCT_SPACING = re.compile(
        r'\b(iphone|macbook|ipad|pixel|galaxy|surface|airpods)'
        r'(\d+|pro|air|mini|max|ultra|plus)\b',
        re.IGNORECASE
    )

    # Stock: remove surrounding noise, keep just the ticker
    _STOCK_NOISE = re.compile(
        r'\b(stock|share|price|shares|equity|ticker|market|trading|invest(?:ment)?)\b',
        re.IGNORECASE
    )

    def _normalize_flight(self, query: str) -> str:
        """Rewrite 'digits CODE' / 'CODE digits' matches as 'CODEdigits'.

        A match is rewritten only when its uppercased letter group is in
        ``IATA_CODES``; any other match is left exactly as written.
        """
        def _repl(m):
            if m.group(1):  # first alternative matched: digits then code
                num, code = m.group(1), m.group(2).upper()
            else:           # second alternative matched: code then digits
                code, num = m.group(3).upper(), m.group(4)
            if code in IATA_CODES:
                return f"{code}{num}"
            return m.group(0)

        return self._FLIGHT_LOOSE.sub(_repl, query)

    def _normalize_stock(self, query: str) -> Optional[str]:
        """Return the canonical ticker for a stock-intent query, else ``None``.

        A ticker is returned only when some token resolves to a ticker (a
        direct hit in ``STOCK_ALIASES`` or an entry in ``_ALIAS_TO_TICKER``)
        AND the query either contains a stock noise word or literally
        contains the ticker token itself.
        """
        ql = query.lower().strip()
        tokens = ql.split()

        # Check if any token is a known ticker or alias
        found_ticker = None
        for tok in tokens:
            # Direct ticker match (uppercase) wins immediately.
            if tok.upper() in STOCK_ALIASES:
                found_ticker = tok.upper()
                break
            # Alias match: remember it but keep scanning, so a later direct
            # ticker token can still take precedence.
            if tok in _ALIAS_TO_TICKER:
                found_ticker = _ALIAS_TO_TICKER[tok]

        if found_ticker:
            # Case 1: stock noise words present (e.g. "AAPL stock price").
            # Searching is equivalent to the previous "substitute and compare"
            # check without building a throwaway string.
            if self._STOCK_NOISE.search(ql):
                return found_ticker
            # Case 2: explicit ticker token present alongside alias
            # (e.g. "apple aapl", "google GOOGL") — but NOT "google pixel 8"
            if found_ticker.lower() in tokens:
                return found_ticker
        return None

    def _normalize_product_spacing(self, query: str) -> str:
        """Insert a space between a known product name and its glued suffix."""
        return self._PRODUCT_SPACING.sub(lambda m: f"{m.group(1)} {m.group(2)}", query)

    def _normalize_word_order(self, query: str) -> str:
        """Reorder product queries so the brand/product-line token comes first.

        Handles patterns like:
        's24 samsung' → 'samsung s24'
        'pro 14 macbook' → 'macbook pro 14'
        'ultra s23 samsung'→ 'samsung ultra s23'

        Only the first ``PRODUCT_BRANDS`` token found after position 0 is
        moved. A reordered result is lowercased; otherwise *query* is
        returned untouched.
        """
        tokens = query.lower().split()
        if len(tokens) < 2:
            return query
        # Find a PRODUCT_BRANDS token that is not already at position 0
        for i, tok in enumerate(tokens):
            if i > 0 and tok in PRODUCT_BRANDS:
                # Move brand to front, preserve relative order of the rest
                return " ".join([tok] + tokens[:i] + tokens[i + 1:])
        return query

    def normalize(self, query: str) -> str:
        """Apply every rule to *query* and return the normalized string."""
        q = query.strip()

        # 1. Stock canonicalization (returns early with just the ticker)
        stock = self._normalize_stock(q)
        if stock:
            return stock

        # 2. Flight ID normalization
        q = self._normalize_flight(q)

        # 3. Product spacing
        q = self._normalize_product_spacing(q)

        # 4. Product word order
        q = self._normalize_word_order(q)

        # 5. Clean up extra whitespace
        q = re.sub(r'\s+', ' ', q).strip()

        return q
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# ── 5. RapidFuzz (brand matching) ────────────────────────────────────────────
|
| 359 |
+
|
| 360 |
+
class RapidFuzzNormalizer(Normalizer):
    """Replace a short query with its closest entry in ``BRANDS``.

    The whole lowercased query is fuzzy-matched against ``BRANDS`` with
    ``token_sort_ratio``; the matched brand replaces the query when its
    score reaches ``score_cutoff``. Queries that are not matched are
    returned exactly as they were passed in.
    """
    name = "RapidFuzz (brand match)"

    def __init__(self, score_cutoff: int = 82):
        """Store the minimum score a brand must reach to be accepted.

        Raises:
            RuntimeError: if rapidfuzz is not installed.
        """
        if not HAS_RAPIDFUZZ:
            raise RuntimeError("rapidfuzz not installed")
        self._cutoff = score_cutoff

    def normalize(self, query: str) -> str:
        """Return the best-matching brand, or *query* when nothing qualifies."""
        ql = query.lower().strip()

        # Only attempt brand correction on short queries (≤ 3 tokens)
        if len(ql.split()) > 3:
            return query

        # Skip very short queries — too ambiguous to fuzzy-match safely
        # (e.g. 'appl', 'npm', 'gcc' should not be matched to brand names)
        if len(ql) <= 5:
            return query

        # Match the full query against the brand list. No n-gram or
        # sliding-window matching is performed here.
        result = _rf_process.extractOne(
            ql, BRANDS,
            scorer=_rf_fuzz.token_sort_ratio,
            score_cutoff=self._cutoff,
        )
        if result:
            # extractOne yields (choice, score, index/key); only the choice
            # is needed.
            return result[0]

        return query
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
# ── 6. Combined ───────────────────────────────────────────────────────────────
|
| 396 |
+
|
| 397 |
+
class CombinedNormalizer(Normalizer):
    """Cascade of Rules, then SymSpell, then RapidFuzz.

    The first stage whose output differs from the stripped query
    (case-insensitively) wins; later stages are not consulted.
    """
    name = "Combined (Rules + SymSpell + RapidFuzz)"

    def __init__(self):
        """Build the stages; optional ones are ``None`` when unavailable."""
        self._rules = RulesNormalizer()
        self._symspell = None
        if HAS_SYMSPELL:
            self._symspell = SymSpellNormalizer()
        self._rfuzz = None
        if HAS_RAPIDFUZZ:
            self._rfuzz = RapidFuzzNormalizer()

    def normalize(self, query: str) -> str:
        """Return the first stage output that changes the query, else the stripped query."""
        q = query.strip()

        # Entity/structural rules run first (highest precision); a change
        # made by the rules is trusted as-is.
        by_rules = self._rules.normalize(q)
        if by_rules.lower() != q.lower():
            return by_rules

        # SymSpell handles general typos; RapidFuzz then catches brand typos
        # SymSpell misses on compound names like "bestbuyt" → "best buy".
        for stage in (self._symspell, self._rfuzz):
            if not stage:
                continue
            candidate = stage.normalize(q)
            if candidate.lower() != q.lower():
                return candidate

        return q
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
# ── 7. GuardedPySpell ────────────────────────────────────────────────────────
|
| 430 |
+
|
| 431 |
+
class GuardedPySpellNormalizer(Normalizer):
    """PySpellChecker with guards to prevent over-correction.

    PySpellChecker gets 88% on single_typo and 71% on multi_typo, but has
    40% over-correction on no-change queries (e.g. 'appl' → 'apple').

    Guards:
    - Skip tokens ≤ 4 chars (appl, npm, gcc, css, java, rust, echo, go)
    - Skip all-uppercase tokens (AAPL, NYC, SQ — abbreviations/tickers)

    Most legitimate short abbreviations are ≤ 4 chars or all-caps.
    Typos worth correcting are almost always ≥ 5 chars ('wheather', 'suhsi').
    """
    name = "PySpell (guarded)"

    def __init__(self):
        """Create the underlying spell checker.

        Raises:
            RuntimeError: if pyspellchecker is not installed.
        """
        if not HAS_PYSPELL:
            raise RuntimeError("pyspellchecker not installed")
        self._sc = _SC()

    def _skip(self, token: str) -> bool:
        """Return True when *token* must be left uncorrected.

        The token must be passed in its original case; the uppercase guard
        cannot fire on text that has already been lowercased.
        """
        return len(token) <= 4 or token.isupper()

    def normalize(self, query: str) -> str:
        """Spell-correct each unguarded token and return the lowercased query.

        The guard is evaluated on the original-case token so that
        all-uppercase tokens longer than four characters are protected
        too. Previously the query was lowercased first, which made the
        uppercase guard unreachable.
        """
        corrected = []
        for raw in query.split():
            lowered = raw.lower()
            if self._skip(raw):
                corrected.append(lowered)
            else:
                # correction() may return None; fall back to the token itself.
                corrected.append(self._sc.correction(lowered) or lowered)
        return " ".join(corrected)
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
# ── 8. CombinedV2 (Rules + GuardedPySpell + RapidFuzz) ───────────────────────
|
| 463 |
+
|
| 464 |
+
class CombinedV2Normalizer(Normalizer):
    """Improved pipeline: Rules → RapidFuzz (single-token) → SymSpell split → GuardedPySpell → RapidFuzz (multi-token).

    Rules handles structured entities (flight IDs, stock tickers, product
    spacing/order). For single-token queries RapidFuzz runs next so brand
    typos (bestbuyt→best buy) are caught before SymSpell can corrupt them
    (bestbuyt→best but); SymSpell compound splitting then handles
    concatenated words (nearme→near me). GuardedPySpell handles general
    typos while protecting short tokens, and RapidFuzz runs once more at
    the end for multi-token brand typos.
    """
    name = "CombinedV2 (Rules + GuardedPySpell + RapidFuzz)"

    def __init__(self):
        """Build the stages; optional ones are ``None`` when unavailable."""
        self._rules = RulesNormalizer()
        self._symspell = None
        if HAS_SYMSPELL:
            self._symspell = SymSpellNormalizer()
        self._pyspell = None
        if HAS_PYSPELL:
            self._pyspell = GuardedPySpellNormalizer()
        self._rfuzz = None
        if HAS_RAPIDFUZZ:
            self._rfuzz = RapidFuzzNormalizer()

    def normalize(self, query: str) -> str:
        """Return the output of the first stage that accepts a change, else the stripped query."""
        q = query.strip()
        single_token = ' ' not in q

        # Step 1: Rules — flight IDs, stock tickers, product spacing/order
        by_rules = self._rules.normalize(q)
        if by_rules.lower() != q.lower():
            return by_rules

        if single_token:
            # Step 2: RapidFuzz — brand name typos for single-token queries.
            # Must run before SymSpell compound splitting: SymSpell splits
            # 'bestbuyt' into 'best but' (wrong) whereas RapidFuzz correctly
            # maps it to 'best buy'.
            if self._rfuzz:
                by_brand = self._rfuzz.normalize(q)
                if by_brand.lower() != q.lower():
                    return by_brand

            # Step 3: SymSpell compound splitting for single-token queries
            # only. GuardedPySpell would corrupt 'nearme'→'name',
            # 'newyork'→'network'. The SymSpell result is accepted only if
            # it introduces a space (a split, not a word substitution).
            if self._symspell:
                by_split = self._symspell.normalize(q)
                if ' ' in by_split:
                    return by_split

        # Step 4: GuardedPySpell — general typos (skips short/uppercase tokens)
        if self._pyspell:
            by_spell = self._pyspell.normalize(q)
            if by_spell.lower() != q.lower():
                return by_spell

        # Step 5: RapidFuzz — brand name typos for multi-token queries
        # (e.g. 'gooogle maps' → 'google maps', 'spotifiy premium' → 'spotify premium')
        if self._rfuzz:
            by_brand = self._rfuzz.normalize(q)
            if by_brand.lower() != q.lower():
                return by_brand

        return q
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
# ── 9. ContextualSpellCheck (spaCy + BERT) ───────────────────────────────────
|
| 524 |
+
|
| 525 |
+
class ContextualSpellCheckNormalizer(Normalizer):
    """Spell correction driven by BERT contextual embeddings.

    Unlike SymSpell, the model sees the full query before deciding whether
    and how to correct a token — so 'appl' in an ambiguous context stays
    as-is, while 'wheather nyc' correctly becomes 'weather nyc'.

    Requires:
    pip install contextualSpellCheck
    python -m spacy download en_core_web_sm
    """
    name = "ContextualSpellCheck (BERT)"

    def __init__(self):
        """Bind the shared spaCy pipeline.

        Raises:
            RuntimeError: if contextualSpellCheck is not available.
        """
        if HAS_CONTEXTUAL:
            self._nlp = _csc_nlp
        else:
            raise RuntimeError("contextualSpellCheck not available")

    def normalize(self, query: str) -> str:
        """Return the pipeline's corrected string, or *query* when it is empty."""
        # doc._.outcome_spellCheck is the full corrected string
        corrected = self._nlp(query)._.outcome_spellCheck
        return corrected or query
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
# ── 10. T5 Spell Corrector (HuggingFace) ─────────────────────────────────────
|
| 550 |
+
|
| 551 |
+
class T5SpellCorrector(Normalizer):
    """Seq2seq spelling corrector backed by a fine-tuned T5 model.

    Model: oliverguhr/spelling-correction-english-base

    The model is trained on noisy→clean sentence pairs. It handles
    multi-token typos, word order, and spacing better than
    dictionary-based approaches, but at significantly higher latency.

    Expected latency: ~100–500ms on CPU, ~20–80ms on GPU.

    Requires:
    pip install transformers torch (or transformers sentencepiece)
    """
    name = "T5 (oliverguhr/spelling-correction)"

    _MODEL_ID = "oliverguhr/spelling-correction-english-base"

    def __init__(self):
        """Defer model loading until ``warmup`` (or the first ``normalize``).

        Raises:
            RuntimeError: if transformers is not installed.
        """
        if not HAS_TRANSFORMERS:
            raise RuntimeError("transformers not installed")
        # Stays None until warmup() builds the pipeline.
        self._pipe = None

    def warmup(self) -> None:
        """Load the text2text pipeline and run one dummy generation."""
        model_id = self._MODEL_ID
        print(f" Loading {model_id}...", end=" ", flush=True)
        self._pipe = _hf_pipeline(
            "text2text-generation",
            model=model_id,
            tokenizer=model_id,
        )
        # Prime the model with a dummy query
        self._pipe("warmup query", max_length=64)
        print("ready")

    def normalize(self, query: str) -> str:
        """Return the model's generated correction for *query*, stripped."""
        if self._pipe is None:
            self.warmup()
        outputs = self._pipe(query, max_length=128, num_beams=4)
        top = outputs[0]
        return top["generated_text"].strip()
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
# ── 11. CombinedML (Rules → T5) ──────────────────────────────────────────────
|
| 592 |
+
|
| 593 |
+
class CombinedMLNormalizer(Normalizer):
    """Best-of-both-worlds pipeline:
    1. Rules handle structured entity normalization (flight IDs, stock tickers,
    product model reordering) with zero latency and perfect precision.
    2. T5 handles everything else — general typos, multi-token corrections,
    brand names — using full-query context.

    This avoids running T5 on queries that rules already handle perfectly,
    saving latency on the most common structured patterns.
    """
    name = "CombinedML (Rules → T5)"

    def __init__(self):
        """Build the rules stage and, when transformers is available, the T5 stage."""
        self._rules = RulesNormalizer()
        self._t5 = T5SpellCorrector() if HAS_TRANSFORMERS else None

    def warmup(self) -> None:
        """Load the T5 model ahead of time when a T5 stage exists."""
        if self._t5:
            self._t5.warmup()

    def normalize(self, query: str) -> str:
        """Return the rules output when rules change the query, else the T5 output.

        The query is stripped before the comparison. The rules stage strips
        its input, so comparing against the raw query would treat any query
        with surrounding whitespace as "changed by rules" and skip T5.
        """
        q = query.strip()

        # Step 1: Rules first — highest precision for structured entities
        q_rules = self._rules.normalize(q)
        if q_rules.lower() != q.lower():
            return q_rules

        # Step 2: T5 for everything else
        if self._t5:
            return self._t5.normalize(q)

        return q
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
# ── Metrics ───────────────────────────────────────────────────────────────────
|
| 627 |
+
|
| 628 |
+
def char_error_rate(pred: str, gold: str) -> float:
    """CER = edit_distance / max(len(pred), len(gold)), case-insensitive.

    Both strings are lowercased once, and the same lowercased strings feed
    the distance and the length denominator. Lowercasing can change a
    string's length for some Unicode characters, so mixing lowercased
    distance with original lengths could push the ratio above 1.

    Returns 0.0 when both strings are empty.
    """
    pred_l = pred.lower()
    gold_l = gold.lower()
    longest = max(len(pred_l), len(gold_l))
    if longest == 0:
        return 0.0
    return edit_distance(pred_l, gold_l) / longest
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
def word_error_rate(pred: str, gold: str) -> float:
    """WER = token-level edit distance / number of gold tokens.

    Both strings are lowercased and split on whitespace. Returns 0.0 when
    *gold* has no tokens.
    """
    hyp = pred.lower().split()
    ref = gold.lower().split()
    ref_len = len(ref)
    if ref_len == 0:
        return 0.0

    # Levenshtein over tokens, keeping only the previous and current rows.
    row = list(range(ref_len + 1))
    for i, hyp_tok in enumerate(hyp, start=1):
        above = row
        row = [i]
        for j, ref_tok in enumerate(ref, start=1):
            if hyp_tok == ref_tok:
                cost = above[j - 1]
            else:
                cost = 1 + min(above[j], row[j - 1], above[j - 1])
            row.append(cost)
    return row[ref_len] / ref_len
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
def run_benchmark(normalizer: Normalizer, df: pd.DataFrame, n_timing_reps: int = 5) -> dict:
    """Run a normalizer on the dataset and return metrics.

    Reads the ``noisy``, ``canonical``, ``category`` and ``should_change``
    columns of *df*. The returned dict holds aggregate accuracy and latency
    figures plus the scored copy of the frame (``_df``) and the raw
    per-query latencies (``_latencies``).
    """
    queries = df["noisy"].tolist()

    # ── Timing ───────────────────────────────────────────────────────────────
    # Each query is timed as the mean over n_timing_reps back-to-back calls.
    latencies_ms = []
    for query in queries:
        started = time.perf_counter()
        for _ in range(n_timing_reps):
            normalizer.normalize(query)
        finished = time.perf_counter()
        latencies_ms.append((finished - started) / n_timing_reps * 1000)

    # ── Predictions ──────────────────────────────────────────────────────────
    preds = [normalizer.normalize(query) for query in queries]
    scored = df.copy()
    scored["pred"] = preds

    scored["em"] = scored.apply(
        lambda row: row["pred"].lower().strip() == row["canonical"].lower().strip(),
        axis=1,
    )
    scored["cer"] = scored.apply(
        lambda row: char_error_rate(row["pred"], row["canonical"]), axis=1
    )
    scored["wer"] = scored.apply(
        lambda row: word_error_rate(row["pred"], row["canonical"]), axis=1
    )

    # No-change precision and over-correction rate
    untouched = scored[~scored["should_change"]]
    if len(untouched):
        left_alone = (
            untouched["pred"].str.lower().str.strip()
            == untouched["noisy"].str.lower().str.strip()
        )
        no_change_precision = left_alone.mean()
    else:
        no_change_precision = float("nan")

    if np.isnan(no_change_precision):
        over_correction = float("nan")
    else:
        over_correction = 1.0 - no_change_precision

    # ── Per-category exact match ──────────────────────────────────────────────
    cat_em = scored.groupby("category")["em"].mean().to_dict()

    metrics = {
        "name": normalizer.name,
        "exact_match": scored["em"].mean(),
        "cer_mean": scored["cer"].mean(),
        "wer_mean": scored["wer"].mean(),
        "no_change_precision": no_change_precision,
        "over_correction": over_correction,
        "latency_mean_ms": np.mean(latencies_ms),
        "latency_p50_ms": np.percentile(latencies_ms, 50),
        "latency_p95_ms": np.percentile(latencies_ms, 95),
        "latency_p99_ms": np.percentile(latencies_ms, 99),
        "per_category": cat_em,
        "_df": scored,  # store for detailed output
        "_latencies": latencies_ms,
    }
    return metrics
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
# ── Main ──────────────────────────────────────────────────────────────────────
|
| 704 |
+
|
| 705 |
+
def _print_table(rows: list) -> None:
    """Print a list of row dicts as a table, using tabulate when installed."""
    try:
        from tabulate import tabulate
    except ImportError:
        print(pd.DataFrame(rows).to_string(index=False))
    else:
        print(tabulate(rows, headers="keys", tablefmt="rounded_outline"))


def main():
    """Benchmark every available normalizer and report the results.

    Reads the dataset CSV named by ``--dataset``, runs each normalizer with
    ``--reps`` timing repetitions per query, prints the summary,
    per-category and sample-prediction tables, and writes the combined
    normalizer's full predictions to ``results.csv`` next to the dataset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default=str(Path(__file__).parent / "dataset.csv"))
    parser.add_argument("--reps", type=int, default=5, help="Timing repetitions per query")
    args = parser.parse_args()

    df = pd.read_csv(args.dataset)
    print(f"Loaded {len(df)} rows from {args.dataset}")
    print(f"Categories: {df['category'].value_counts().to_dict()}\n")

    # ── Build normalizer list ─────────────────────────────────────────────────
    normalizers: list[Normalizer] = [IdentityNormalizer(), RulesNormalizer()]
    if HAS_PYSPELL:
        normalizers.append(PySpellNormalizer())
    if HAS_SYMSPELL:
        normalizers.append(SymSpellNormalizer())
    if HAS_RAPIDFUZZ:
        normalizers.append(RapidFuzzNormalizer())
    if HAS_SYMSPELL and HAS_RAPIDFUZZ:
        normalizers.append(CombinedNormalizer())
    if HAS_PYSPELL:
        normalizers.append(GuardedPySpellNormalizer())
    if HAS_PYSPELL and HAS_RAPIDFUZZ:
        normalizers.append(CombinedV2Normalizer())
    # ML normalizers (disabled — too slow and underperform rules-based)
    # if HAS_CONTEXTUAL:
    #     normalizers.append(ContextualSpellCheckNormalizer())
    # if HAS_TRANSFORMERS:
    #     normalizers.append(T5SpellCorrector())
    #     normalizers.append(CombinedMLNormalizer())

    # Warmup
    for norm in normalizers:
        norm.warmup()

    # ── Run benchmarks ────────────────────────────────────────────────────────
    results = []
    for norm in normalizers:
        print(f"Benchmarking: {norm.name}...", end=" ", flush=True)
        r = run_benchmark(norm, df, n_timing_reps=args.reps)
        results.append(r)
        print(f"EM={r['exact_match']:.1%} CER={r['cer_mean']:.3f} lat_p50={r['latency_p50_ms']:.2f}ms")

    # ── Summary table ─────────────────────────────────────────────────────────
    print("\n" + "="*90)
    print("SUMMARY — Overall Metrics")
    print("="*90)

    summary_rows = []
    for r in results:
        summary_rows.append({
            "Normalizer": r["name"],
            "Exact Match": f"{r['exact_match']:.1%}",
            "CER": f"{r['cer_mean']:.3f}",
            "WER": f"{r['wer_mean']:.3f}",
            "No-change Prec.": f"{r['no_change_precision']:.1%}" if not np.isnan(r['no_change_precision']) else "N/A",
            "Over-correction": f"{r['over_correction']:.1%}" if not np.isnan(r['over_correction']) else "N/A",
            "Lat mean (ms)": f"{r['latency_mean_ms']:.2f}",
            "Lat p50 (ms)": f"{r['latency_p50_ms']:.2f}",
            "Lat p95 (ms)": f"{r['latency_p95_ms']:.2f}",
            "Lat p99 (ms)": f"{r['latency_p99_ms']:.2f}",
        })

    _print_table(summary_rows)

    # ── Per-category table ────────────────────────────────────────────────────
    categories = sorted(df["category"].unique())
    print("\n" + "="*90)
    print("PER-CATEGORY Exact Match")
    print("="*90)

    cat_rows = []
    for r in results:
        row = {"Normalizer": r["name"][:30]}
        for cat in categories:
            row[cat] = f"{r['per_category'].get(cat, float('nan')):.0%}"
        cat_rows.append(row)

    _print_table(cat_rows)

    # ── Sample predictions ────────────────────────────────────────────────────
    print("\n" + "="*90)
    print("SAMPLE PREDICTIONS — Combined vs Identity (first 5 per category)")
    print("="*90)

    # Prefer CombinedV2, then any Combined normalizer, then the last one run.
    combined_r = next((r for r in results if "CombinedV2" in r["name"]),
                      next((r for r in results if "Combined" in r["name"]), results[-1]))
    combined_df = combined_r["_df"]

    for cat in categories:
        sub = combined_df[combined_df["category"] == cat].head(5)
        print(f"\n {cat.upper()}")
        print(f" {'Noisy':<30} {'Canonical':<25} {'Combined pred':<25} {'EM':>4}")
        print(f" {'-'*30} {'-'*25} {'-'*25} {'-'*4}")
        # Only the combined normalizer's rows are printed; the identity
        # baseline's prediction equals the 'Noisy' column.
        for _, row in sub.iterrows():
            em_mark = "✓" if row["em"] else "✗"
            print(f" {row['noisy']:<30} {row['canonical']:<25} {row['pred']:<25} {em_mark:>4}")

    # ── Save full results ─────────────────────────────────────────────────────
    out_path = Path(args.dataset).parent / "results.csv"
    combined_df.to_csv(out_path, index=False)
    print(f"\nFull predictions saved to {out_path}")


if __name__ == "__main__":
    main()
|
dataset.csv
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
noisy,canonical,category,should_change
|
| 2 |
+
wheather nyc,weather nyc,single_typo,True
|
| 3 |
+
calclator,calculator,single_typo,True
|
| 4 |
+
forcast london,forecast london,single_typo,True
|
| 5 |
+
temprature converter,temperature converter,single_typo,True
|
| 6 |
+
restaurent near me,restaurant near me,single_typo,True
|
| 7 |
+
translater english,translator english,single_typo,True
|
| 8 |
+
defintion of entropy,definition of entropy,single_typo,True
|
| 9 |
+
seperate the words,separate the words,single_typo,True
|
| 10 |
+
accomodation paris,accommodation paris,single_typo,True
|
| 11 |
+
recieve email,receive email,single_typo,True
|
| 12 |
+
suhsi near me,sushi near me,multi_typo,True
|
| 13 |
+
restarant near me,restaurant near me,multi_typo,True
|
| 14 |
+
wether forcast today,weather forecast today,multi_typo,True
|
| 15 |
+
plmber emergancy,plumber emergency,multi_typo,True
|
| 16 |
+
nearist cofee shop,nearest coffee shop,multi_typo,True
|
| 17 |
+
cheep flihgts paris,cheap flights paris,multi_typo,True
|
| 18 |
+
hotl delas nyc,hotel deals nyc,multi_typo,True
|
| 19 |
+
hosptial emergancy rm,hospital emergency room,multi_typo,True
|
| 20 |
+
bestbuyt,best buy,brand_typo,True
|
| 21 |
+
youtueb,youtube,brand_typo,True
|
| 22 |
+
gooogle maps,google maps,brand_typo,True
|
| 23 |
+
amazom prime,amazon prime,brand_typo,True
|
| 24 |
+
netflx login,netflix login,brand_typo,True
|
| 25 |
+
spotifiy premium,spotify premium,brand_typo,True
|
| 26 |
+
facbook login,facebook login,brand_typo,True
|
| 27 |
+
instagrem,instagram,brand_typo,True
|
| 28 |
+
linkdin profile,linkedin profile,brand_typo,True
|
| 29 |
+
gitub repo,github repo,brand_typo,True
|
| 30 |
+
163 SQ,SQ163,flight_order,True
|
| 31 |
+
100 AA,AA100,flight_order,True
|
| 32 |
+
417 BA,BA417,flight_order,True
|
| 33 |
+
SQ 163,SQ163,flight_order,True
|
| 34 |
+
AA 100,AA100,flight_order,True
|
| 35 |
+
815 DL,DL815,flight_order,True
|
| 36 |
+
200 UA,UA200,flight_order,True
|
| 37 |
+
flight 163 SQ,SQ163,flight_order,True
|
| 38 |
+
AA flight 100,AA100,flight_order,True
|
| 39 |
+
15 iphone,iphone 15,product_order,True
|
| 40 |
+
pro 14 macbook,macbook pro 14,product_order,True
|
| 41 |
+
s24 samsung,samsung s24,product_order,True
|
| 42 |
+
ultra s23 samsung,samsung s23 ultra,product_order,True
|
| 43 |
+
air 13 macbook,macbook air 13,product_order,True
|
| 44 |
+
pro ipad 12,ipad pro 12,product_order,True
|
| 45 |
+
max pro 15 iphone,iphone 15 pro max,product_order,True
|
| 46 |
+
pixel 8 google,google pixel 8,product_order,True
|
| 47 |
+
tab s9 samsung,samsung tab s9,product_order,True
|
| 48 |
+
AAPL stock,AAPL,stock_canon,True
|
| 49 |
+
stock TSLA,TSLA,stock_canon,True
|
| 50 |
+
apple aapl,AAPL,stock_canon,True
|
| 51 |
+
tesla stock price,TSLA,stock_canon,True
|
| 52 |
+
MSFT share price,MSFT,stock_canon,True
|
| 53 |
+
google stock GOOGL,GOOGL,stock_canon,True
|
| 54 |
+
amazon AMZN stock,AMZN,stock_canon,True
|
| 55 |
+
meta stock FB,META,stock_canon,True
|
| 56 |
+
nvda share price,NVDA,stock_canon,True
|
| 57 |
+
iphone15,iphone 15,spacing,True
|
| 58 |
+
macbookpro,macbook pro,spacing,True
|
| 59 |
+
nearme,near me,spacing,True
|
| 60 |
+
bestbuy,best buy,spacing,True
|
| 61 |
+
newyork,new york,spacing,True
|
| 62 |
+
unitedstates,united states,spacing,True
|
| 63 |
+
wifi password,wifi password,spacing,False
|
| 64 |
+
hotdog,hotdog,spacing,False
|
| 65 |
+
appl,appl,no_change,False
|
| 66 |
+
rust,rust,no_change,False
|
| 67 |
+
delta,delta,no_change,False
|
| 68 |
+
apple,apple,no_change,False
|
| 69 |
+
python,python,no_change,False
|
| 70 |
+
java,java,no_change,False
|
| 71 |
+
echo,echo,no_change,False
|
| 72 |
+
spring,spring,no_change,False
|
| 73 |
+
cloud,cloud,no_change,False
|
| 74 |
+
mercury,mercury,no_change,False
|
| 75 |
+
npm,npm,no_change,False
|
| 76 |
+
gcc,gcc,no_change,False
|
| 77 |
+
css,css,no_change,False
|
| 78 |
+
go,go,no_change,False
|
| 79 |
+
swift,swift,no_change,False
|
| 80 |
+
waether forecast tomorrow,weather forecast tomorrow,single_typo,True
|
| 81 |
+
best pizzeria near me,best pizzeria near me,single_typo,False
|
| 82 |
+
how to cook pasta,how to cook pasta,single_typo,False
|
| 83 |
+
gas stations nearby,gas stations nearby,single_typo,False
|
| 84 |
+
resturant reservations online,restaurant reservations online,single_typo,True
|
| 85 |
+
puplic libraries near me,public libraries near me,single_typo,True
|
| 86 |
+
best plumber in my area,best plumber in my area,single_typo,False
|
| 87 |
+
forcast weekend weather,forecast weekend weather,single_typo,True
|
| 88 |
+
how to pronounce worcester,how to pronounce worcester,single_typo,False
|
| 89 |
+
recipie for chocolate cake,recipe for chocolate cake,single_typo,True
|
| 90 |
+
hardware store locator,hardware store locator,single_typo,False
|
| 91 |
+
trafic conditions now,traffic conditions now,single_typo,True
|
| 92 |
+
vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False
|
| 93 |
+
how to spell occassion,how to spell occasion,single_typo,True
|
| 94 |
+
dentist appointements available,dentist appointments available,single_typo,True
|
| 95 |
+
nearest pharmacy open now,nearest pharmacy open now,single_typo,False
|
| 96 |
+
beginner gardening tips,beginner gardening tips,single_typo,False
|
| 97 |
+
calorie counter app,calorie counter app,single_typo,False
|
| 98 |
+
what is the defintion of serendipity,what is the definition of serendipity,single_typo,True
|
| 99 |
+
best electrician in town,best electrician in town,single_typo,False
|
| 100 |
+
humidty levels today,humidity levels today,single_typo,True
|
| 101 |
+
directions to airport,directions to airport,single_typo,False
|
| 102 |
+
how to spel definitely,how to spell definitely,single_typo,True
|
| 103 |
+
grocery stores near me,grocery stores near me,single_typo,False
|
| 104 |
+
buisness hours for target,business hours for target,single_typo,True
|
| 105 |
+
weather in chicago tommorow,weather in chicago tomorrow,single_typo,True
|
| 106 |
+
how to make omlette,how to make omelette,single_typo,True
|
| 107 |
+
atm machine locations,atm machine locations,single_typo,False
|
| 108 |
+
barber shop avaialble,barber shop available,single_typo,True
|
| 109 |
+
best restauarant in boston,best restaurant in boston,single_typo,True
|
| 110 |
+
how to cook brocoli,how to cook broccoli,single_typo,True
|
| 111 |
+
swimming pools near me,swimming pools near me,single_typo,False
|
| 112 |
+
seperate the documents,separate the documents,single_typo,True
|
| 113 |
+
temperture in fahrenheit,temperature in fahrenheit,single_typo,True
|
| 114 |
+
parking garage nearby,parking garage nearby,single_typo,False
|
| 115 |
+
how to make lasagna recepie,how to make lasagna recipe,single_typo,True
|
| 116 |
+
veterinary clinic hours,veterinary clinic hours,single_typo,False
|
| 117 |
+
what does recieve mean,what does receive mean,single_typo,True
|
| 118 |
+
yoga classes availible now,yoga classes available now,single_typo,True
|
| 119 |
+
begininng spanish lessons,beginning spanish lessons,single_typo,True
|
| 120 |
+
plumber near me emergancy,plumber near me emergency,multi_typo,True
|
| 121 |
+
dentist appoitment availble,dentist appointment available,multi_typo,True
|
| 122 |
+
electritian repaire servises,electrician repair services,multi_typo,True
|
| 123 |
+
weather forcast this weakend,weather forecast this weekend,multi_typo,True
|
| 124 |
+
neerest gass station,nearest gas station,multi_typo,True
|
| 125 |
+
hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True
|
| 126 |
+
autombile mechanic lokation,automobile mechanic location,multi_typo,True
|
| 127 |
+
humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True
|
| 128 |
+
locksmith emergancy servise,locksmith emergency service,multi_typo,True
|
| 129 |
+
flight ticket prices comparision,flight ticket prices comparison,multi_typo,True
|
| 130 |
+
restauant reservaton opem,restaurant reservation open,multi_typo,True
|
| 131 |
+
carpentor contrator estimat,carpenter contractor estimate,multi_typo,True
|
| 132 |
+
tempreture alert wheather,temperature alert weather,multi_typo,True
|
| 133 |
+
tourist atraction guidebook,tourist attraction guidebook,multi_typo,True
|
| 134 |
+
laundry servise neerby,laundry service nearby,multi_typo,True
|
| 135 |
+
vehcile registation renewel,vehicle registration renewal,multi_typo,True
|
| 136 |
+
snowstrom warning forcast,snowstorm warning forecast,multi_typo,True
|
| 137 |
+
hostotel accomodation deals,hostel accommodation deals,multi_typo,True
|
| 138 |
+
haircut appoitment schedul,haircut appointment schedule,multi_typo,True
|
| 139 |
+
sunrise time locaton,sunrise time location,multi_typo,True
|
| 140 |
+
moving compeny quoate,moving company quote,multi_typo,True
|
| 141 |
+
road conidtion trafic update,road condition traffic update,multi_typo,True
|
| 142 |
+
veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True
|
| 143 |
+
vacation packge discunt availble,vacation package discount available,multi_typo,True
|
| 144 |
+
pest contral servise lokation,pest control service location,multi_typo,True
|
| 145 |
+
pollin forcast alergy,pollen forecast allergy,multi_typo,True
|
| 146 |
+
airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True
|
| 147 |
+
window cleening compny rates,window cleaning company rates,multi_typo,True
|
| 148 |
+
wind gust wheather alert,wind gust weather alert,multi_typo,True
|
| 149 |
+
rentral car comparision price,rental car comparison price,multi_typo,True
|
| 150 |
+
goggle.com,google.com,brand_typo,True
|
| 151 |
+
amazn.com,amazon.com,brand_typo,True
|
| 152 |
+
spotfiy music,spotify music,brand_typo,True
|
| 153 |
+
instgram app,instagram app,brand_typo,True
|
| 154 |
+
gitub profile,github profile,brand_typo,True
|
| 155 |
+
redditt.com,reddit.com,brand_typo,True
|
| 156 |
+
twiter feed,twitter feed,brand_typo,True
|
| 157 |
+
linkdin jobs,linkedin jobs,brand_typo,True
|
| 158 |
+
microsodt office,microsoft office,brand_typo,True
|
| 159 |
+
adoobe creative,adobe creative,brand_typo,True
|
| 160 |
+
dropbx files,dropbox files,brand_typo,True
|
| 161 |
+
zom meeting,zoom meeting,brand_typo,True
|
| 162 |
+
slck workspace,slack workspace,brand_typo,True
|
| 163 |
+
paypa checkout,paypal checkout,brand_typo,True
|
| 164 |
+
ebya auction,ebay auction,brand_typo,True
|
| 165 |
+
wallmart groceries,walmart groceries,brand_typo,True
|
| 166 |
+
targat deals,target deals,brand_typo,True
|
| 167 |
+
nytimez news,nytimes news,brand_typo,True
|
| 168 |
+
bbc.co.uk,bbc.com,brand_typo,True
|
| 169 |
+
cnn breaking,cnn breaking news,brand_typo,True
|
| 170 |
+
youtub video,youtube video,brand_typo,True
|
| 171 |
+
netflic series,netflix series,brand_typo,True
|
| 172 |
+
googl drive,google drive,brand_typo,True
|
| 173 |
+
amzon shopping,amazon shopping,brand_typo,True
|
| 174 |
+
spotiffy playlist,spotify playlist,brand_typo,True
|
| 175 |
+
facebk messenger,facebook messenger,brand_typo,True
|
| 176 |
+
insta stories,instagram stories,brand_typo,True
|
| 177 |
+
gihub code,github code,brand_typo,True
|
| 178 |
+
reddot forum,reddit forum,brand_typo,True
|
| 179 |
+
BA 287,BA287,flight_order,True
|
| 180 |
+
502 DL,DL502,flight_order,True
|
| 181 |
+
flight UA 441,UA441,flight_order,True
|
| 182 |
+
lh 156,LH156,flight_order,True
|
| 183 |
+
273 AF,AF273,flight_order,True
|
| 184 |
+
EK 89,EK89,flight_order,True
|
| 185 |
+
621qr,QR621,flight_order,True
|
| 186 |
+
CX 884,CX884,flight_order,True
|
| 187 |
+
345 vs,VS345,flight_order,True
|
| 188 |
+
KL 714,KL714,flight_order,True
|
| 189 |
+
193ib,IB193,flight_order,True
|
| 190 |
+
TK 427,TK427,flight_order,True
|
| 191 |
+
flight 556 AA,AA556,flight_order,True
|
| 192 |
+
738 ba,BA738,flight_order,True
|
| 193 |
+
DL 212,DL212,flight_order,True
|
| 194 |
+
84ua,UA84,flight_order,True
|
| 195 |
+
AF 609,AF609,flight_order,True
|
| 196 |
+
445 ek,EK445,flight_order,True
|
| 197 |
+
sq 267,SQ267,flight_order,True
|
| 198 |
+
572 CX,CX572,flight_order,True
|
| 199 |
+
flight vs314,VS314,flight_order,True
|
| 200 |
+
981 KL,KL981,flight_order,True
|
| 201 |
+
IB 456,IB456,flight_order,True
|
| 202 |
+
tk 103,TK103,flight_order,True
|
| 203 |
+
890 SQ,SQ890,flight_order,True
|
| 204 |
+
13 air macbook,macbook air 13,product_order,True
|
| 205 |
+
pro iphone 15,iphone 15 pro,product_order,True
|
| 206 |
+
8 pixel google,google pixel 8,product_order,True
|
| 207 |
+
ultra 24 s23 samsung,samsung s23 ultra,product_order,True
|
| 208 |
+
fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True
|
| 209 |
+
max 14 pro iphone,iphone 14 pro max,product_order,True
|
| 210 |
+
16 macbook pro,macbook pro 16,product_order,True
|
| 211 |
+
pixel pro 7 google,google pixel 7 pro,product_order,True
|
| 212 |
+
tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True
|
| 213 |
+
12 mini iphone,iphone 12 mini,product_order,True
|
| 214 |
+
z fold 4 samsung,samsung galaxy z fold 4,product_order,True
|
| 215 |
+
watch series 9 apple,apple watch series 9,product_order,True
|
| 216 |
+
xl pixel 8 google,google pixel 8 xl,product_order,True
|
| 217 |
+
s24 ultra samsung,samsung s24 ultra,product_order,True
|
| 218 |
+
15 macbook air,macbook air 15,product_order,True
|
| 219 |
+
iphone pro 13,iphone 13 pro,product_order,True
|
| 220 |
+
flip 5 z samsung,samsung galaxy z flip 5,product_order,True
|
| 221 |
+
7 series watch apple,apple watch series 7,product_order,True
|
| 222 |
+
a15 oneplus,oneplus a15,product_order,True
|
| 223 |
+
pad air 11 ipad,ipad air 11,product_order,True
|
| 224 |
+
ultra 15 iphone pro,iphone 15 pro max,product_order,True
|
| 225 |
+
note 24 galaxy samsung,samsung galaxy note 24,product_order,True
|
| 226 |
+
11 pro max iphone,iphone 11 pro max,product_order,True
|
| 227 |
+
studio display apple,apple studio display,product_order,True
|
| 228 |
+
x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True
|
| 229 |
+
AAPL stock price,AAPL,stock_canon,True
|
| 230 |
+
tesla share price,TSLA,stock_canon,True
|
| 231 |
+
MSFT earnings,MSFT,stock_canon,True
|
| 232 |
+
google GOOGL stock,GOOGL,stock_canon,True
|
| 233 |
+
AMZN share,AMZN,stock_canon,True
|
| 234 |
+
amazon price AMZN,AMZN,stock_canon,True
|
| 235 |
+
META stock price,META,stock_canon,True
|
| 236 |
+
nvidia NVDA,NVDA,stock_canon,True
|
| 237 |
+
NFLX share price,NFLX,stock_canon,True
|
| 238 |
+
netflix stock NFLX,NFLX,stock_canon,True
|
| 239 |
+
PYPL price,PYPL,stock_canon,True
|
| 240 |
+
paypal PYPL stock,PYPL,stock_canon,True
|
| 241 |
+
SNAP stock,SNAP,stock_canon,True
|
| 242 |
+
snapchat SNAP,SNAP,stock_canon,True
|
| 243 |
+
AMD share price,AMD,stock_canon,True
|
| 244 |
+
amd processor stock,AMD,stock_canon,True
|
| 245 |
+
INTC earnings,INTC,stock_canon,True
|
| 246 |
+
intel INTC stock,INTC,stock_canon,True
|
| 247 |
+
QCOM price,QCOM,stock_canon,True
|
| 248 |
+
qualcomm QCOM,QCOM,stock_canon,True
|
| 249 |
+
UBER stock price,UBER,stock_canon,True
|
| 250 |
+
lyft LYFT share,LYFT,stock_canon,True
|
| 251 |
+
airbnb ABNB stock,ABNB,stock_canon,True
|
| 252 |
+
iphone15pro,iphone 15 pro,spacing,True
|
| 253 |
+
samsungz9,samsung z9,spacing,True
|
| 254 |
+
ipadair,ipad air,spacing,True
|
| 255 |
+
losangeles,los angeles,spacing,True
|
| 256 |
+
sanfrancisco,san francisco,spacing,True
|
| 257 |
+
nearbyshops,nearby shops,spacing,True
|
| 258 |
+
dellxps13,dell xps 13,spacing,True
|
| 259 |
+
surfacelaptopp5,surface laptop p5,spacing,True
|
| 260 |
+
newyorkpizza,new york pizza,spacing,True
|
| 261 |
+
holmescompany,holmes company,spacing,True
|
| 262 |
+
findme,find me,spacing,True
|
| 263 |
+
bostonma,boston ma,spacing,True
|
| 264 |
+
sanjose,san jose,spacing,True
|
| 265 |
+
pixelwatch2,pixel watch 2,spacing,True
|
| 266 |
+
openpizza,open pizza,spacing,True
|
| 267 |
+
northcarolina,north carolina,spacing,True
|
| 268 |
+
showevenear,show venues near,spacing,True
|
| 269 |
+
galax30series,galax 30 series,spacing,True
|
| 270 |
+
laptop,laptop,spacing,False
|
| 271 |
+
smartphone,smartphone,spacing,False
|
| 272 |
+
keyboard,keyboard,spacing,False
|
| 273 |
+
monitor,monitor,spacing,False
|
| 274 |
+
NYC,NYC,no_change,False
|
| 275 |
+
LA,LA,no_change,False
|
| 276 |
+
UK,UK,no_change,False
|
| 277 |
+
vue,vue,no_change,False
|
| 278 |
+
aws,aws,no_change,False
|
| 279 |
+
sql,sql,no_change,False
|
| 280 |
+
git,git,no_change,False
|
| 281 |
+
c,c,no_change,False
|
| 282 |
+
x,x,no_change,False
|
| 283 |
+
r,r,no_change,False
|
| 284 |
+
z,z,no_change,False
|
| 285 |
+
kafka,kafka,no_change,False
|
| 286 |
+
nginx,nginx,no_change,False
|
| 287 |
+
vim,vim,no_change,False
|
| 288 |
+
pdf,pdf,no_change,False
|
| 289 |
+
xml,xml,no_change,False
|
| 290 |
+
svg,svg,no_change,False
|
| 291 |
+
gcp,gcp,no_change,False
|
| 292 |
+
cli,cli,no_change,False
|
| 293 |
+
api,api,no_change,False
|
| 294 |
+
jwt,jwt,no_change,False
|
| 295 |
+
mvp,mvp,no_change,False
|
| 296 |
+
gdpr,gdpr,no_change,False
|
| 297 |
+
crm,crm,no_change,False
|
| 298 |
+
ux,ux,no_change,False
|
| 299 |
+
pwa,pwa,no_change,False
|
| 300 |
+
orm,orm,no_change,False
|
requirements.txt
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
-
|
| 2 |
pandas
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.32.0
|
| 2 |
pandas
|
| 3 |
+
symspellpy
|
| 4 |
+
rapidfuzz
|
| 5 |
+
python-Levenshtein
|
| 6 |
+
pyspellchecker
|
results.csv
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
noisy,canonical,category,should_change,pred,em,cer,wer
|
| 2 |
+
wheather nyc,weather nyc,single_typo,True,whether nyc,False,0.18181818181818182,0.5
|
| 3 |
+
calclator,calculator,single_typo,True,calculator,True,0.0,0.0
|
| 4 |
+
forcast london,forecast london,single_typo,True,forecast london,True,0.0,0.0
|
| 5 |
+
temprature converter,temperature converter,single_typo,True,temperature converter,True,0.0,0.0
|
| 6 |
+
restaurent near me,restaurant near me,single_typo,True,restaurant near me,True,0.0,0.0
|
| 7 |
+
translater english,translator english,single_typo,True,translate english,False,0.1111111111111111,0.5
|
| 8 |
+
defintion of entropy,definition of entropy,single_typo,True,definition of entropy,True,0.0,0.0
|
| 9 |
+
seperate the words,separate the words,single_typo,True,separate the words,True,0.0,0.0
|
| 10 |
+
accomodation paris,accommodation paris,single_typo,True,accommodation paris,True,0.0,0.0
|
| 11 |
+
recieve email,receive email,single_typo,True,receive email,True,0.0,0.0
|
| 12 |
+
suhsi near me,sushi near me,multi_typo,True,sushi near me,True,0.0,0.0
|
| 13 |
+
restarant near me,restaurant near me,multi_typo,True,restaurant near me,True,0.0,0.0
|
| 14 |
+
wether forcast today,weather forecast today,multi_typo,True,wether forecast today,False,0.045454545454545456,0.3333333333333333
|
| 15 |
+
plmber emergancy,plumber emergency,multi_typo,True,plumber emergency,True,0.0,0.0
|
| 16 |
+
nearist cofee shop,nearest coffee shop,multi_typo,True,nearest coffee shop,True,0.0,0.0
|
| 17 |
+
cheep flihgts paris,cheap flights paris,multi_typo,True,cheep flights paris,False,0.05263157894736842,0.3333333333333333
|
| 18 |
+
hotl delas nyc,hotel deals nyc,multi_typo,True,hotl delay nyc,False,0.26666666666666666,0.6666666666666666
|
| 19 |
+
hosptial emergancy rm,hospital emergency room,multi_typo,True,hospital emergency rm,False,0.08695652173913043,0.3333333333333333
|
| 20 |
+
bestbuyt,best buy,brand_typo,True,best buy,True,0.0,0.0
|
| 21 |
+
youtueb,youtube,brand_typo,True,youtube,True,0.0,0.0
|
| 22 |
+
gooogle maps,google maps,brand_typo,True,google maps,True,0.0,0.0
|
| 23 |
+
amazom prime,amazon prime,brand_typo,True,amazon prime,True,0.0,0.0
|
| 24 |
+
netflx login,netflix login,brand_typo,True,netflix login,True,0.0,0.0
|
| 25 |
+
spotifiy premium,spotify premium,brand_typo,True,spotifiy premium,False,0.0625,0.5
|
| 26 |
+
facbook login,facebook login,brand_typo,True,facebook login,True,0.0,0.0
|
| 27 |
+
instagrem,instagram,brand_typo,True,instagram,True,0.0,0.0
|
| 28 |
+
linkdin profile,linkedin profile,brand_typo,True,linking profile,False,0.1875,0.5
|
| 29 |
+
gitub repo,github repo,brand_typo,True,tub repo,False,0.2727272727272727,0.5
|
| 30 |
+
163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
|
| 31 |
+
100 AA,AA100,flight_order,True,AA100,True,0.0,0.0
|
| 32 |
+
417 BA,BA417,flight_order,True,BA417,True,0.0,0.0
|
| 33 |
+
SQ 163,SQ163,flight_order,True,SQ163,True,0.0,0.0
|
| 34 |
+
AA 100,AA100,flight_order,True,AA100,True,0.0,0.0
|
| 35 |
+
815 DL,DL815,flight_order,True,DL815,True,0.0,0.0
|
| 36 |
+
200 UA,UA200,flight_order,True,UA200,True,0.0,0.0
|
| 37 |
+
flight 163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
|
| 38 |
+
AA flight 100,AA100,flight_order,True,AA flight 100,False,0.6153846153846154,3.0
|
| 39 |
+
15 iphone,iphone 15,product_order,True,iphone 15,True,0.0,0.0
|
| 40 |
+
pro 14 macbook,macbook pro 14,product_order,True,macbook pro 14,True,0.0,0.0
|
| 41 |
+
s24 samsung,samsung s24,product_order,True,samsung s24,True,0.0,0.0
|
| 42 |
+
ultra s23 samsung,samsung s23 ultra,product_order,True,samsung ultra s23,False,0.47058823529411764,0.6666666666666666
|
| 43 |
+
air 13 macbook,macbook air 13,product_order,True,macbook air 13,True,0.0,0.0
|
| 44 |
+
pro ipad 12,ipad pro 12,product_order,True,ipad pro 12,True,0.0,0.0
|
| 45 |
+
max pro 15 iphone,iphone 15 pro max,product_order,True,iphone max pro 15,False,0.35294117647058826,0.5
|
| 46 |
+
pixel 8 google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
|
| 47 |
+
tab s9 samsung,samsung tab s9,product_order,True,samsung tab s9,True,0.0,0.0
|
| 48 |
+
AAPL stock,AAPL,stock_canon,True,AAPL,True,0.0,0.0
|
| 49 |
+
stock TSLA,TSLA,stock_canon,True,TSLA,True,0.0,0.0
|
| 50 |
+
apple aapl,AAPL,stock_canon,True,AAPL,True,0.0,0.0
|
| 51 |
+
tesla stock price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
|
| 52 |
+
MSFT share price,MSFT,stock_canon,True,MSFT,True,0.0,0.0
|
| 53 |
+
google stock GOOGL,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
|
| 54 |
+
amazon AMZN stock,AMZN,stock_canon,True,AMZN,True,0.0,0.0
|
| 55 |
+
meta stock FB,META,stock_canon,True,META,True,0.0,0.0
|
| 56 |
+
nvda share price,NVDA,stock_canon,True,NVDA,True,0.0,0.0
|
| 57 |
+
iphone15,iphone 15,spacing,True,iphone 15,True,0.0,0.0
|
| 58 |
+
macbookpro,macbook pro,spacing,True,macbook pro,True,0.0,0.0
|
| 59 |
+
nearme,near me,spacing,True,near me,True,0.0,0.0
|
| 60 |
+
bestbuy,best buy,spacing,True,best buy,True,0.0,0.0
|
| 61 |
+
newyork,new york,spacing,True,new york,True,0.0,0.0
|
| 62 |
+
unitedstates,united states,spacing,True,united states,True,0.0,0.0
|
| 63 |
+
wifi password,wifi password,spacing,False,wifi password,True,0.0,0.0
|
| 64 |
+
hotdog,hotdog,spacing,False,hotdog,True,0.0,0.0
|
| 65 |
+
appl,appl,no_change,False,appl,True,0.0,0.0
|
| 66 |
+
rust,rust,no_change,False,rust,True,0.0,0.0
|
| 67 |
+
delta,delta,no_change,False,delta,True,0.0,0.0
|
| 68 |
+
apple,apple,no_change,False,apple,True,0.0,0.0
|
| 69 |
+
python,python,no_change,False,python,True,0.0,0.0
|
| 70 |
+
java,java,no_change,False,java,True,0.0,0.0
|
| 71 |
+
echo,echo,no_change,False,echo,True,0.0,0.0
|
| 72 |
+
spring,spring,no_change,False,spring,True,0.0,0.0
|
| 73 |
+
cloud,cloud,no_change,False,cloud,True,0.0,0.0
|
| 74 |
+
mercury,mercury,no_change,False,mercury,True,0.0,0.0
|
| 75 |
+
npm,npm,no_change,False,npm,True,0.0,0.0
|
| 76 |
+
gcc,gcc,no_change,False,gcc,True,0.0,0.0
|
| 77 |
+
css,css,no_change,False,css,True,0.0,0.0
|
| 78 |
+
go,go,no_change,False,go,True,0.0,0.0
|
| 79 |
+
swift,swift,no_change,False,swift,True,0.0,0.0
|
| 80 |
+
waether forecast tomorrow,weather forecast tomorrow,single_typo,True,whether forecast tomorrow,False,0.08,0.3333333333333333
|
| 81 |
+
best pizzeria near me,best pizzeria near me,single_typo,False,best pizzeria near me,True,0.0,0.0
|
| 82 |
+
how to cook pasta,how to cook pasta,single_typo,False,how to cook pasta,True,0.0,0.0
|
| 83 |
+
gas stations nearby,gas stations nearby,single_typo,False,gas stations nearby,True,0.0,0.0
|
| 84 |
+
resturant reservations online,restaurant reservations online,single_typo,True,restaurant reservations online,True,0.0,0.0
|
| 85 |
+
puplic libraries near me,public libraries near me,single_typo,True,public libraries near me,True,0.0,0.0
|
| 86 |
+
best plumber in my area,best plumber in my area,single_typo,False,best plumber in my area,True,0.0,0.0
|
| 87 |
+
forcast weekend weather,forecast weekend weather,single_typo,True,forecast weekend weather,True,0.0,0.0
|
| 88 |
+
how to pronounce worcester,how to pronounce worcester,single_typo,False,how to pronounce worcester,True,0.0,0.0
|
| 89 |
+
recipie for chocolate cake,recipe for chocolate cake,single_typo,True,recipe for chocolate cake,True,0.0,0.0
|
| 90 |
+
hardware store locator,hardware store locator,single_typo,False,hardware store locator,True,0.0,0.0
|
| 91 |
+
trafic conditions now,traffic conditions now,single_typo,True,traffic conditions now,True,0.0,0.0
|
| 92 |
+
vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False,vacuum cleaner reviews,True,0.0,0.0
|
| 93 |
+
how to spell occassion,how to spell occasion,single_typo,True,how to spell occasion,True,0.0,0.0
|
| 94 |
+
dentist appointements available,dentist appointments available,single_typo,True,dentist appointments available,True,0.0,0.0
|
| 95 |
+
nearest pharmacy open now,nearest pharmacy open now,single_typo,False,nearest pharmacy open now,True,0.0,0.0
|
| 96 |
+
beginner gardening tips,beginner gardening tips,single_typo,False,beginner gardening tips,True,0.0,0.0
|
| 97 |
+
calorie counter app,calorie counter app,single_typo,False,calorie counter app,True,0.0,0.0
|
| 98 |
+
what is the defintion of serendipity,what is the definition of serendipity,single_typo,True,what is the definition of serendipity,True,0.0,0.0
|
| 99 |
+
best electrician in town,best electrician in town,single_typo,False,best electrician in town,True,0.0,0.0
|
| 100 |
+
humidty levels today,humidity levels today,single_typo,True,humidity levels today,True,0.0,0.0
|
| 101 |
+
directions to airport,directions to airport,single_typo,False,directions to airport,True,0.0,0.0
|
| 102 |
+
how to spel definitely,how to spell definitely,single_typo,True,how to spel definitely,False,0.043478260869565216,0.25
|
| 103 |
+
grocery stores near me,grocery stores near me,single_typo,False,grocery stores near me,True,0.0,0.0
|
| 104 |
+
buisness hours for target,business hours for target,single_typo,True,business hours for target,True,0.0,0.0
|
| 105 |
+
weather in chicago tommorow,weather in chicago tomorrow,single_typo,True,weather in chicago tomorrow,True,0.0,0.0
|
| 106 |
+
how to make omlette,how to make omelette,single_typo,True,how to make omelette,True,0.0,0.0
|
| 107 |
+
atm machine locations,atm machine locations,single_typo,False,atm machine locations,True,0.0,0.0
|
| 108 |
+
barber shop avaialble,barber shop available,single_typo,True,barber shop available,True,0.0,0.0
|
| 109 |
+
best restauarant in boston,best restaurant in boston,single_typo,True,best restaurant in boston,True,0.0,0.0
|
| 110 |
+
how to cook brocoli,how to cook broccoli,single_typo,True,how to cook broccoli,True,0.0,0.0
|
| 111 |
+
swimming pools near me,swimming pools near me,single_typo,False,swimming pools near me,True,0.0,0.0
|
| 112 |
+
seperate the documents,separate the documents,single_typo,True,separate the documents,True,0.0,0.0
|
| 113 |
+
temperture in fahrenheit,temperature in fahrenheit,single_typo,True,temperature in fahrenheit,True,0.0,0.0
|
| 114 |
+
parking garage nearby,parking garage nearby,single_typo,False,parking garage nearby,True,0.0,0.0
|
| 115 |
+
how to make lasagna recepie,how to make lasagna recipe,single_typo,True,how to make lasagna receive,False,0.07407407407407407,0.2
|
| 116 |
+
veterinary clinic hours,veterinary clinic hours,single_typo,False,veterinary clinic hours,True,0.0,0.0
|
| 117 |
+
what does recieve mean,what does receive mean,single_typo,True,what does receive mean,True,0.0,0.0
|
| 118 |
+
yoga classes availible now,yoga classes available now,single_typo,True,yoga classes available now,True,0.0,0.0
|
| 119 |
+
begininng spanish lessons,beginning spanish lessons,single_typo,True,beginning spanish lessons,True,0.0,0.0
|
| 120 |
+
plumber near me emergancy,plumber near me emergency,multi_typo,True,plumber near me emergency,True,0.0,0.0
|
| 121 |
+
dentist appoitment availble,dentist appointment available,multi_typo,True,dentist appointment available,True,0.0,0.0
|
| 122 |
+
electritian repaire servises,electrician repair services,multi_typo,True,electrician repair services,True,0.0,0.0
|
| 123 |
+
weather forcast this weakend,weather forecast this weekend,multi_typo,True,weather forecast this weekend,True,0.0,0.0
|
| 124 |
+
neerest gass station,nearest gas station,multi_typo,True,nearest gass station,False,0.05,0.3333333333333333
|
| 125 |
+
hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True,hotel reservation cheep rates,False,0.034482758620689655,0.25
|
| 126 |
+
autombile mechanic lokation,automobile mechanic location,multi_typo,True,automobile mechanic location,True,0.0,0.0
|
| 127 |
+
humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True,humidity forecast tomorrow,True,0.0,0.0
|
| 128 |
+
locksmith emergancy servise,locksmith emergency service,multi_typo,True,locksmith emergency service,True,0.0,0.0
|
| 129 |
+
flight ticket prices comparision,flight ticket prices comparison,multi_typo,True,flight ticket prices comparison,True,0.0,0.0
|
| 130 |
+
restauant reservaton opem,restaurant reservation open,multi_typo,True,restaurant reservation opem,False,0.037037037037037035,0.3333333333333333
|
| 131 |
+
carpentor contrator estimat,carpenter contractor estimate,multi_typo,True,carpenter contractor estimate,True,0.0,0.0
|
| 132 |
+
tempreture alert wheather,temperature alert weather,multi_typo,True,temperature alert whether,False,0.08,0.3333333333333333
|
| 133 |
+
tourist atraction guidebook,tourist attraction guidebook,multi_typo,True,tourist attraction guidebook,True,0.0,0.0
|
| 134 |
+
laundry servise neerby,laundry service nearby,multi_typo,True,laundry service nearby,True,0.0,0.0
|
| 135 |
+
vehcile registation renewel,vehicle registration renewal,multi_typo,True,vehicle registration renewed,False,0.07142857142857142,0.3333333333333333
|
| 136 |
+
snowstrom warning forcast,snowstorm warning forecast,multi_typo,True,snowstorm warning forecast,True,0.0,0.0
|
| 137 |
+
hostotel accomodation deals,hostel accommodation deals,multi_typo,True,hostel accommodation deals,True,0.0,0.0
|
| 138 |
+
haircut appoitment schedul,haircut appointment schedule,multi_typo,True,haircut appointment schedule,True,0.0,0.0
|
| 139 |
+
sunrise time locaton,sunrise time location,multi_typo,True,sunrise time location,True,0.0,0.0
|
| 140 |
+
moving compeny quoate,moving company quote,multi_typo,True,moving company quote,True,0.0,0.0
|
| 141 |
+
road conidtion trafic update,road condition traffic update,multi_typo,True,road condition traffic update,True,0.0,0.0
|
| 142 |
+
veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True,veterinarian emergency clinic,True,0.0,0.0
|
| 143 |
+
vacation packge discunt availble,vacation package discount available,multi_typo,True,vacation package discount available,True,0.0,0.0
|
| 144 |
+
pest contral servise lokation,pest control service location,multi_typo,True,pest control service location,True,0.0,0.0
|
| 145 |
+
pollin forcast alergy,pollen forecast allergy,multi_typo,True,pollen forecast allergy,True,0.0,0.0
|
| 146 |
+
airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True,airing accommodation recommendation,False,0.05714285714285714,0.3333333333333333
|
| 147 |
+
window cleening compny rates,window cleaning company rates,multi_typo,True,window cleaning company rates,True,0.0,0.0
|
| 148 |
+
wind gust wheather alert,wind gust weather alert,multi_typo,True,wind gust whether alert,False,0.08695652173913043,0.25
|
| 149 |
+
rentral car comparision price,rental car comparison price,multi_typo,True,central car comparison price,False,0.07142857142857142,0.25
|
| 150 |
+
goggle.com,google.com,brand_typo,True,goggle com,False,0.2,2.0
|
| 151 |
+
amazn.com,amazon.com,brand_typo,True,amazon com,False,0.1,2.0
|
| 152 |
+
spotfiy music,spotify music,brand_typo,True,spotty music,False,0.15384615384615385,0.5
|
| 153 |
+
instgram app,instagram app,brand_typo,True,ingram app,False,0.23076923076923078,0.5
|
| 154 |
+
gitub profile,github profile,brand_typo,True,tub profile,False,0.21428571428571427,0.5
|
| 155 |
+
redditt.com,reddit.com,brand_typo,True,reddish com,False,0.2727272727272727,2.0
|
| 156 |
+
twiter feed,twitter feed,brand_typo,True,twitter feed,True,0.0,0.0
|
| 157 |
+
linkdin jobs,linkedin jobs,brand_typo,True,linking jobs,False,0.23076923076923078,0.5
|
| 158 |
+
microsodt office,microsoft office,brand_typo,True,microsoft office,True,0.0,0.0
|
| 159 |
+
adoobe creative,adobe creative,brand_typo,True,adobe creative,True,0.0,0.0
|
| 160 |
+
dropbx files,dropbox files,brand_typo,True,drop files,False,0.23076923076923078,0.5
|
| 161 |
+
zom meeting,zoom meeting,brand_typo,True,zom meeting,False,0.08333333333333333,0.5
|
| 162 |
+
slck workspace,slack workspace,brand_typo,True,slck workspace,False,0.06666666666666667,0.5
|
| 163 |
+
paypa checkout,paypal checkout,brand_typo,True,papa checkout,False,0.13333333333333333,0.5
|
| 164 |
+
ebya auction,ebay auction,brand_typo,True,ebya auction,False,0.16666666666666666,0.5
|
| 165 |
+
wallmart groceries,walmart groceries,brand_typo,True,walmart groceries,True,0.0,0.0
|
| 166 |
+
targat deals,target deals,brand_typo,True,target deals,True,0.0,0.0
|
| 167 |
+
nytimez news,nytimes news,brand_typo,True,anytime news,False,0.16666666666666666,0.5
|
| 168 |
+
bbc.co.uk,bbc.com,brand_typo,True,bic co us,False,0.5555555555555556,3.0
|
| 169 |
+
cnn breaking,cnn breaking news,brand_typo,True,cnn breaking,False,0.29411764705882354,0.3333333333333333
|
| 170 |
+
youtub video,youtube video,brand_typo,True,youtube video,True,0.0,0.0
|
| 171 |
+
netflic series,netflix series,brand_typo,True,netflix series,True,0.0,0.0
|
| 172 |
+
googl drive,google drive,brand_typo,True,GOOGL,False,0.5833333333333334,1.0
|
| 173 |
+
amzon shopping,amazon shopping,brand_typo,True,amazon shopping,True,0.0,0.0
|
| 174 |
+
spotiffy playlist,spotify playlist,brand_typo,True,spiffy playlets,False,0.375,1.0
|
| 175 |
+
facebk messenger,facebook messenger,brand_typo,True,face messenger,False,0.2222222222222222,0.5
|
| 176 |
+
insta stories,instagram stories,brand_typo,True,instar stories,False,0.17647058823529413,0.5
|
| 177 |
+
gihub code,github code,brand_typo,True,hub code,False,0.2727272727272727,0.5
|
| 178 |
+
reddot forum,reddit forum,brand_typo,True,redo forum,False,0.25,0.5
|
| 179 |
+
BA 287,BA287,flight_order,True,BA287,True,0.0,0.0
|
| 180 |
+
502 DL,DL502,flight_order,True,DL502,True,0.0,0.0
|
| 181 |
+
flight UA 441,UA441,flight_order,True,UA441,True,0.0,0.0
|
| 182 |
+
lh 156,LH156,flight_order,True,LH156,True,0.0,0.0
|
| 183 |
+
273 AF,AF273,flight_order,True,AF273,True,0.0,0.0
|
| 184 |
+
EK 89,EK89,flight_order,True,EK89,True,0.0,0.0
|
| 185 |
+
621qr,QR621,flight_order,True,QR621,True,0.0,0.0
|
| 186 |
+
CX 884,CX884,flight_order,True,CX884,True,0.0,0.0
|
| 187 |
+
345 vs,VS345,flight_order,True,VS345,True,0.0,0.0
|
| 188 |
+
KL 714,KL714,flight_order,True,KL714,True,0.0,0.0
|
| 189 |
+
193ib,IB193,flight_order,True,IB193,True,0.0,0.0
|
| 190 |
+
TK 427,TK427,flight_order,True,TK427,True,0.0,0.0
|
| 191 |
+
flight 556 AA,AA556,flight_order,True,AA556,True,0.0,0.0
|
| 192 |
+
738 ba,BA738,flight_order,True,BA738,True,0.0,0.0
|
| 193 |
+
DL 212,DL212,flight_order,True,DL212,True,0.0,0.0
|
| 194 |
+
84ua,UA84,flight_order,True,UA84,True,0.0,0.0
|
| 195 |
+
AF 609,AF609,flight_order,True,AF609,True,0.0,0.0
|
| 196 |
+
445 ek,EK445,flight_order,True,EK445,True,0.0,0.0
|
| 197 |
+
sq 267,SQ267,flight_order,True,SQ267,True,0.0,0.0
|
| 198 |
+
572 CX,CX572,flight_order,True,CX572,True,0.0,0.0
|
| 199 |
+
flight vs314,VS314,flight_order,True,flight vs314,False,0.5833333333333334,1.0
|
| 200 |
+
981 KL,KL981,flight_order,True,KL981,True,0.0,0.0
|
| 201 |
+
IB 456,IB456,flight_order,True,IB456,True,0.0,0.0
|
| 202 |
+
tk 103,TK103,flight_order,True,TK103,True,0.0,0.0
|
| 203 |
+
890 SQ,SQ890,flight_order,True,SQ890,True,0.0,0.0
|
| 204 |
+
13 air macbook,macbook air 13,product_order,True,macbook 13 air,False,0.42857142857142855,0.6666666666666666
|
| 205 |
+
pro iphone 15,iphone 15 pro,product_order,True,iphone pro 15,False,0.46153846153846156,0.6666666666666666
|
| 206 |
+
8 pixel google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
|
| 207 |
+
ultra 24 s23 samsung,samsung s23 ultra,product_order,True,samsung ultra 24 s23,False,0.55,1.0
|
| 208 |
+
fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True,samsung fold 5 galaxy,False,0.47619047619047616,0.5
|
| 209 |
+
max 14 pro iphone,iphone 14 pro max,product_order,True,iphone max 14 pro,False,0.47058823529411764,0.5
|
| 210 |
+
16 macbook pro,macbook pro 16,product_order,True,macbook 16 pro,False,0.42857142857142855,0.6666666666666666
|
| 211 |
+
pixel pro 7 google,google pixel 7 pro,product_order,True,pixel pro 7 google,False,0.7777777777777778,0.75
|
| 212 |
+
tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True,samsung tab s9 galaxy,False,0.47619047619047616,0.5
|
| 213 |
+
12 mini iphone,iphone 12 mini,product_order,True,iphone 12 mini,True,0.0,0.0
|
| 214 |
+
z fold 4 samsung,samsung galaxy z fold 4,product_order,True,samsung z fold 4,False,0.30434782608695654,0.2
|
| 215 |
+
watch series 9 apple,apple watch series 9,product_order,True,watch series 9 apple,False,0.6,0.5
|
| 216 |
+
xl pixel 8 google,google pixel 8 xl,product_order,True,pixel xl 8 google,False,0.7647058823529411,0.75
|
| 217 |
+
s24 ultra samsung,samsung s24 ultra,product_order,True,samsung s24 ultra,True,0.0,0.0
|
| 218 |
+
15 macbook air,macbook air 15,product_order,True,macbook 15 air,False,0.42857142857142855,0.6666666666666666
|
| 219 |
+
iphone pro 13,iphone 13 pro,product_order,True,iphone pro 13,False,0.46153846153846156,0.6666666666666666
|
| 220 |
+
flip 5 z samsung,samsung galaxy z flip 5,product_order,True,samsung flip 5 z,False,0.4782608695652174,0.6
|
| 221 |
+
7 series watch apple,apple watch series 7,product_order,True,7 series watch apple,False,0.7,1.0
|
| 222 |
+
a15 oneplus,oneplus a15,product_order,True,a15 onerous,False,0.9090909090909091,1.0
|
| 223 |
+
pad air 11 ipad,ipad air 11,product_order,True,ipad pad air 11,False,0.26666666666666666,0.3333333333333333
|
| 224 |
+
ultra 15 iphone pro,iphone 15 pro max,product_order,True,iphone ultra 15 pro,False,0.5263157894736842,0.5
|
| 225 |
+
note 24 galaxy samsung,samsung galaxy note 24,product_order,True,samsung note 24 galaxy,False,0.6363636363636364,0.5
|
| 226 |
+
11 pro max iphone,iphone 11 pro max,product_order,True,iphone 11 pro max,True,0.0,0.0
|
| 227 |
+
studio display apple,apple studio display,product_order,True,studio display apple,False,0.6,0.6666666666666666
|
| 228 |
+
x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True,x1 carbon lenore thinkpad,False,0.88,1.0
|
| 229 |
+
AAPL stock price,AAPL,stock_canon,True,AAPL,True,0.0,0.0
|
| 230 |
+
tesla share price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
|
| 231 |
+
MSFT earnings,MSFT,stock_canon,True,MSFT,True,0.0,0.0
|
| 232 |
+
google GOOGL stock,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
|
| 233 |
+
AMZN share,AMZN,stock_canon,True,AMZN,True,0.0,0.0
|
| 234 |
+
amazon price AMZN,AMZN,stock_canon,True,AMZN,True,0.0,0.0
|
| 235 |
+
META stock price,META,stock_canon,True,META,True,0.0,0.0
|
| 236 |
+
nvidia NVDA,NVDA,stock_canon,True,NVDA,True,0.0,0.0
|
| 237 |
+
NFLX share price,NFLX,stock_canon,True,NFLX,True,0.0,0.0
|
| 238 |
+
netflix stock NFLX,NFLX,stock_canon,True,NFLX,True,0.0,0.0
|
| 239 |
+
PYPL price,PYPL,stock_canon,True,PYPL,True,0.0,0.0
|
| 240 |
+
paypal PYPL stock,PYPL,stock_canon,True,PYPL,True,0.0,0.0
|
| 241 |
+
SNAP stock,SNAP,stock_canon,True,SNAP,True,0.0,0.0
|
| 242 |
+
snapchat SNAP,SNAP,stock_canon,True,SNAP,True,0.0,0.0
|
| 243 |
+
AMD share price,AMD,stock_canon,True,AMD,True,0.0,0.0
|
| 244 |
+
amd processor stock,AMD,stock_canon,True,AMD,True,0.0,0.0
|
| 245 |
+
INTC earnings,INTC,stock_canon,True,INTC,True,0.0,0.0
|
| 246 |
+
intel INTC stock,INTC,stock_canon,True,INTC,True,0.0,0.0
|
| 247 |
+
QCOM price,QCOM,stock_canon,True,QCOM price,False,0.6,1.0
|
| 248 |
+
qualcomm QCOM,QCOM,stock_canon,True,qualcomm QCOM,False,0.6923076923076923,1.0
|
| 249 |
+
UBER stock price,UBER,stock_canon,True,UBER,True,0.0,0.0
|
| 250 |
+
lyft LYFT share,LYFT,stock_canon,True,LYFT,True,0.0,0.0
|
| 251 |
+
airbnb ABNB stock,ABNB,stock_canon,True,ABNB,True,0.0,0.0
|
| 252 |
+
iphone15pro,iphone 15 pro,spacing,True,iphone pro,False,0.23076923076923078,0.3333333333333333
|
| 253 |
+
samsungz9,samsung z9,spacing,True,samsung,False,0.3,0.5
|
| 254 |
+
ipadair,ipad air,spacing,True,ipad air,True,0.0,0.0
|
| 255 |
+
losangeles,los angeles,spacing,True,los angeles,True,0.0,0.0
|
| 256 |
+
sanfrancisco,san francisco,spacing,True,san francisco,True,0.0,0.0
|
| 257 |
+
nearbyshops,nearby shops,spacing,True,nearby shops,True,0.0,0.0
|
| 258 |
+
dellxps13,dell xps 13,spacing,True,dell see,False,0.45454545454545453,0.6666666666666666
|
| 259 |
+
surfacelaptopp5,surface laptop p5,spacing,True,surface laptop,False,0.17647058823529413,0.3333333333333333
|
| 260 |
+
newyorkpizza,new york pizza,spacing,True,new pizza,False,0.35714285714285715,0.3333333333333333
|
| 261 |
+
holmescompany,holmes company,spacing,True,holmes company,True,0.0,0.0
|
| 262 |
+
findme,find me,spacing,True,find me,True,0.0,0.0
|
| 263 |
+
bostonma,boston ma,spacing,True,boston a,False,0.1111111111111111,0.5
|
| 264 |
+
sanjose,san jose,spacing,True,san jose,True,0.0,0.0
|
| 265 |
+
pixelwatch2,pixel watch 2,spacing,True,pixel watch,False,0.15384615384615385,0.3333333333333333
|
| 266 |
+
openpizza,open pizza,spacing,True,open pizza,True,0.0,0.0
|
| 267 |
+
northcarolina,north carolina,spacing,True,north carolina,True,0.0,0.0
|
| 268 |
+
showevenear,show venues near,spacing,True,showed near,False,0.375,0.6666666666666666
|
| 269 |
+
galax30series,galax 30 series,spacing,True,galaxy series,False,0.2,0.6666666666666666
|
| 270 |
+
laptop,laptop,spacing,False,laptop,True,0.0,0.0
|
| 271 |
+
smartphone,smartphone,spacing,False,smartphone,True,0.0,0.0
|
| 272 |
+
keyboard,keyboard,spacing,False,keyboard,True,0.0,0.0
|
| 273 |
+
monitor,monitor,spacing,False,monitor,True,0.0,0.0
|
| 274 |
+
NYC,NYC,no_change,False,NYC,True,0.0,0.0
|
| 275 |
+
LA,LA,no_change,False,LA,True,0.0,0.0
|
| 276 |
+
UK,UK,no_change,False,UK,True,0.0,0.0
|
| 277 |
+
vue,vue,no_change,False,vue,True,0.0,0.0
|
| 278 |
+
aws,aws,no_change,False,aws,True,0.0,0.0
|
| 279 |
+
sql,sql,no_change,False,sql,True,0.0,0.0
|
| 280 |
+
git,git,no_change,False,git,True,0.0,0.0
|
| 281 |
+
c,c,no_change,False,c,True,0.0,0.0
|
| 282 |
+
x,x,no_change,False,x,True,0.0,0.0
|
| 283 |
+
r,r,no_change,False,r,True,0.0,0.0
|
| 284 |
+
z,z,no_change,False,z,True,0.0,0.0
|
| 285 |
+
kafka,kafka,no_change,False,kafka,True,0.0,0.0
|
| 286 |
+
nginx,nginx,no_change,False,nine,False,0.4,1.0
|
| 287 |
+
vim,vim,no_change,False,vim,True,0.0,0.0
|
| 288 |
+
pdf,pdf,no_change,False,pdf,True,0.0,0.0
|
| 289 |
+
xml,xml,no_change,False,xml,True,0.0,0.0
|
| 290 |
+
svg,svg,no_change,False,svg,True,0.0,0.0
|
| 291 |
+
gcp,gcp,no_change,False,gcp,True,0.0,0.0
|
| 292 |
+
cli,cli,no_change,False,cli,True,0.0,0.0
|
| 293 |
+
api,api,no_change,False,api,True,0.0,0.0
|
| 294 |
+
jwt,jwt,no_change,False,jwt,True,0.0,0.0
|
| 295 |
+
mvp,mvp,no_change,False,mvp,True,0.0,0.0
|
| 296 |
+
gdpr,gdpr,no_change,False,gdpr,True,0.0,0.0
|
| 297 |
+
crm,crm,no_change,False,crm,True,0.0,0.0
|
| 298 |
+
ux,ux,no_change,False,ux,True,0.0,0.0
|
| 299 |
+
pwa,pwa,no_change,False,pwa,True,0.0,0.0
|
| 300 |
+
orm,orm,no_change,False,orm,True,0.0,0.0
|