Spaces:

vazish
/

query_norm

Running

App Files Files Community

vazish committed 5 days ago

Commit

7b10861

unverified ·

1 Parent(s): 540ec31

query normalization

Browse files

Files changed (5) hide show

app.py +305 -0
benchmark.py +820 -0
dataset.csv +300 -0
requirements.txt +5 -2
results.csv +300 -0

app.py ADDED Viewed

	@@ -0,0 +1,305 @@

+import time
+from pathlib import Path
+import pandas as pd
+import streamlit as st
+st.set_page_config(
+    page_title="Firefox Query Normalizer",
+    page_icon="🔍",
+    layout="wide",
+)
+HERE = Path(__file__).parent
+# ─── Normalizer (loaded once, cached across reruns) ───────────────────────────
+@st.cache_resource(show_spinner="Loading normalizer…")
+def load_normalizer():
+    """Build the ``CombinedV2Normalizer`` used by the interactive tabs.
+    Wrapped in ``st.cache_resource`` so a single instance is created and then
+    reused across Streamlit reruns instead of being rebuilt on every
+    interaction.
+    """
+    # Imported inside the function so the benchmark module is only loaded when
+    # the normalizer is first requested.
+    from benchmark import CombinedV2Normalizer  # noqa: PLC0415
+    return CombinedV2Normalizer()
+# ─── Data ─────────────────────────────────────────────────────────────────────
+@st.cache_data
+def load_data() -> pd.DataFrame:
+    df = pd.read_csv(HERE / "results.csv")
+    df["should_change"] = df["should_change"].astype(bool)
+    df["em"]            = df["em"].astype(bool)
+    df["outcome"]       = df.apply(_classify_outcome, axis=1)
+    return df
+def _classify_outcome(row) -> str:
+    if row["should_change"]:
+        if row["em"]:
+            return "✅ Fixed correctly"
+        elif str(row["pred"]).strip().lower() == str(row["noisy"]).strip().lower():
+            return "❌ Not fixed"
+        else:
+            return "⚠️ Fixed incorrectly"
+    else:
+        return "✅ Left unchanged" if row["em"] else "❌ Over-corrected"
+CATEGORY_INFO: dict[str, tuple[str, str]] = {
+    "single_typo":   ("✏️ Single Typo",    "One misspelled word (e.g. 'wheather' → 'weather')"),
+    "multi_typo":    ("✏️ Multi Typo",     "Two or more typos in the same query"),
+    "brand_typo":    ("🏷️ Brand Typo",     "Brand name misspelled (e.g. 'bestbuyt' → 'best buy')"),
+    "flight_order":  ("✈️ Flight Order",   "Flight number tokens reordered (e.g. '163 SQ' → 'SQ163')"),
+    "product_order": ("📱 Product Order",  "Product tokens reordered (e.g. '15 iphone' → 'iphone 15')"),
+    "stock_canon":   ("📈 Stock Ticker",   "Stock query → ticker only (e.g. 'AAPL stock' → 'AAPL')"),
+    "spacing":       ("⎵  Spacing",        "Missing spaces fixed (e.g. 'nearme' → 'near me')"),
+    "no_change":     ("🔒 No Change",      "Should not be modified — tests over-correction resistance"),
+}
+OUTCOME_ORDER = [
+    "✅ Fixed correctly",
+    "✅ Left unchanged",
+    "❌ Not fixed",
+    "⚠️ Fixed incorrectly",
+    "❌ Over-corrected",
+]
+# ─── Header ───────────────────────────────────────────────────────────────────
+st.title("🔍 Query Normalizer")
+st.caption("**CombinedV2** pipeline · Preprocessing stage for Merino intent classification")
+with st.expander("ℹ️ What is this and why does it matter?", expanded=False):
+    st.markdown("""
+    Intent detection tries to classify user queries by intents —
+    navigational, local, commercial, etc. — to surface the right suggestions.
+    Real queries are noisy: users make typos, omit spaces, or enter tokens in the
+    wrong order.
+    **CombinedV2** is a lightweight rule + dictionary normalizer that runs in **< 1 ms**
+    per query. It runs 4 steps in sequence and short-circuits as soon as a fix is made:
+    | Step | What it handles | Example |
+    |------|----------------|---------|
+    | **1 · Rules** | Flight IDs, stock tickers, product token reordering | `163 SQ` → `SQ163` |
+    | **2 · RapidFuzz** | Fuzzy brand matching (single-token only) | `bestbuyt` → `best buy` |
+    | **3 · SymSpell** | Concatenated word splitting | `nearme` → `near me` |
+    | **4 · GuardedPySpell** | Spell correction (skips ≤4-char tokens & ALL_CAPS) | `wheather nyc` → `weather nyc` |
+    **Benchmark results across 299 queries in 8 categories:**
+    | Metric | Score |
+    |--------|-------|
+    | Exact match on queries that need fixing | **73.2%** |
+    | Precision on queries that should NOT change | **98.5%** |
+    | Median latency (p50) | **0.03 ms** |
+    """)
+st.divider()
+# ─── Tabs ─────────────────────────────────────────────────────────────────────
+tab_try, tab_browse, tab_perf = st.tabs(["🔤 Try It", "📋 Browse Examples", "📊 Performance"])
+# ══════════════════════════════════════════════════════════════════════
+# TAB 1 — Try It
+# ══════════════════════════════════════════════════════════════════════
+with tab_try:
+    norm = load_normalizer()
+    df   = load_data()
+    # ── Free-form input (prominent) ───────────────────────────────────
+    st.subheader("Type a query to normalize")
+    st.caption("Try typos, missing spaces, scrambled product names, flight numbers, stock tickers…")
+    user_query = st.text_input(
+        "Query input",
+        placeholder="e.g.  wheather nyc  ·  163 SQ  ·  bestbuyt  ·  nearme  ·  15 iphone  ·  AAPL stock",
+        label_visibility="collapsed",
+        key="user_query",
+    )
+    if user_query.strip():
+        t0 = time.perf_counter()
+        result = norm.normalize(user_query.strip())
+        elapsed_ms = (time.perf_counter() - t0) * 1000
+        res_col, meta_col = st.columns([3, 1])
+        with res_col:
+            if result.lower() == user_query.strip().lower():
+                st.success(f"**`{user_query.strip()}`** → no change needed → **`{result}`**")
+            else:
+                st.info(f"**`{user_query.strip()}`** → **`{result}`**")
+        with meta_col:
+            st.metric("Latency", f"{elapsed_ms:.2f} ms")
+        # Check if it's in the benchmark dataset
+        match = df[df["noisy"].str.lower() == user_query.strip().lower()]
+        if len(match):
+            row = match.iloc[0]
+            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
+            if result == row["canonical"]:
+                note = f"✅ Matches expected output `{row['canonical']}`"
+            else:
+                note = f"Expected `{row['canonical']}` · benchmark outcome: **{row['outcome']}**"
+            st.caption(f"_Found in benchmark · {cat_label} · {note}_")
+    st.divider()
+    # ── Example picker ────────────────────────────────────────────────
+    st.subheader("Or pick an example from the benchmark")
+    pick_col1, pick_col2 = st.columns(2)
+    with pick_col1:
+        cat_pick = st.selectbox(
+            "Category",
+            ["All"] + list(CATEGORY_INFO.keys()),
+            format_func=lambda k: "All categories" if k == "All" else CATEGORY_INFO[k][0],
+            key="cat_pick",
+        )
+    with pick_col2:
+        show_errors_only = st.checkbox("Errors / failures only", value=False)
+    sub = df if cat_pick == "All" else df[df["category"] == cat_pick]
+    if show_errors_only:
+        sub = sub[~sub["em"]]
+    if len(sub) == 0:
+        st.info("No examples match these filters.")
+    else:
+        example_labels = [
+            f"{row.noisy}   [{CATEGORY_INFO.get(row.category, (row.category,''))[0]}]"
+            for row in sub.itertuples()
+        ]
+        picked_label = st.selectbox("Example", example_labels, key="example_pick")
+        picked_noisy = picked_label.split("   [")[0]
+        row = sub[sub["noisy"] == picked_noisy].iloc[0]
+        ex_left, ex_right = st.columns([3, 1])
+        with ex_left:
+            t0 = time.perf_counter()
+            ex_result  = norm.normalize(picked_noisy)
+            elapsed_ms = (time.perf_counter() - t0) * 1000
+            st.markdown(f"**Input:** `{picked_noisy}`")
+            st.markdown(f"**Expected:** `{row['canonical']}`")
+            if ex_result == row["canonical"]:
+                st.success(f"**Got:** `{ex_result}` ✅")
+            elif ex_result.lower() == picked_noisy.lower():
+                st.error(f"**Got:** `{ex_result}` — normalizer didn't fix it")
+            else:
+                st.warning(f"**Got:** `{ex_result}` — expected `{row['canonical']}`")
+        with ex_right:
+            st.metric("Latency", f"{elapsed_ms:.2f} ms")
+            cat_label = CATEGORY_INFO.get(row["category"], (row["category"], ""))[0]
+            st.caption(cat_label)
+            st.caption(CATEGORY_INFO.get(row["category"], ("", row["category"]))[1])
+# ══════════════════════════════════════════════════════════════════════
+# TAB 2 — Browse Examples
+# ══════════════════════════════════════════════════════════════════════
+with tab_browse:
+    df = load_data()
+    f1, f2 = st.columns(2)
+    with f1:
+        cats = st.multiselect(
+            "Categories",
+            options=list(CATEGORY_INFO.keys()),
+            default=list(CATEGORY_INFO.keys()),
+            format_func=lambda k: CATEGORY_INFO[k][0],
+        )
+    with f2:
+        outcomes = st.multiselect(
+            "Outcomes",
+            options=OUTCOME_ORDER,
+            default=OUTCOME_ORDER,
+        )
+    filtered = df[df["category"].isin(cats) & df["outcome"].isin(outcomes)]
+    st.caption(f"Showing **{len(filtered)}** of {len(df)} examples")
+    display = filtered[["noisy", "pred", "canonical", "category", "outcome"]].copy()
+    display.columns = ["Input (noisy)", "Predicted", "Expected", "Category", "Outcome"]
+    display["Category"] = display["Category"].map(
+        lambda k: CATEGORY_INFO.get(k, (k, ""))[0]
+    )
+    st.dataframe(
+        display,
+        use_container_width=True,
+        hide_index=True,
+        height=540,
+        column_config={
+            "Input (noisy)": st.column_config.TextColumn(width="medium"),
+            "Predicted":     st.column_config.TextColumn(width="medium"),
+            "Expected":      st.column_config.TextColumn(width="medium"),
+            "Category":      st.column_config.TextColumn(width="medium"),
+            "Outcome":       st.column_config.TextColumn(width="small"),
+        },
+    )
+# ══════════════════════════════════════════════════════════════════════
+# TAB 3 — Performance
+# ══════════════════════════════════════════════════════════════════════
+with tab_perf:
+    df = load_data()
+    needs_change = df[df["should_change"]]
+    no_change    = df[~df["should_change"]]
+    c1, c2, c3, c4 = st.columns(4)
+    c1.metric("Total examples",      f"{len(df)}")
+    c2.metric("Overall EM",          f"{df['em'].mean():.1%}")
+    c3.metric("Fix accuracy",        f"{needs_change['em'].mean():.1%}",
+              help="Exact match on queries that SHOULD change")
+    c4.metric("No-change precision", f"{no_change['em'].mean():.1%}",
+              help="Correctly left unchanged queries that should NOT change")
+    st.markdown("---")
+    st.subheader("Per-category breakdown")
+    rows = []
+    for cat, (label, desc) in CATEGORY_INFO.items():
+        sub   = df[df["category"] == cat]
+        if len(sub) == 0:
+            continue
+        needs = sub[sub["should_change"]]
+        ok    = sub[~sub["should_change"]]
+        rows.append({
+            "Category":        label,
+            "n":               len(sub),
+            "EM %":            f"{sub['em'].mean():.0%}",
+            "Fix accuracy":    f"{needs['em'].mean():.0%}" if len(needs) else "—",
+            "No-change prec.": f"{ok['em'].mean():.0%}"    if len(ok)    else "—",
+            "Errors":          int((~sub["em"]).sum()),
+            "What it tests":   desc,
+        })
+    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
+    st.markdown("---")
+    st.subheader("Failure cases by category")
+    st.caption("All queries where the normalizer produced a wrong output.")
+    failures = df[~df["em"]]
+    if len(failures) == 0:
+        st.success("No failures!")
+    else:
+        for cat, (label, _) in CATEGORY_INFO.items():
+            sub = failures[failures["category"] == cat]
+            if len(sub) == 0:
+                continue
+            with st.expander(f"{label} — {len(sub)} failure{'s' if len(sub) != 1 else ''}"):
+                show = sub[["noisy", "pred", "canonical", "outcome"]].copy()
+                show.columns = ["Input", "Predicted", "Expected", "Outcome"]
+                st.dataframe(show, use_container_width=True, hide_index=True)

benchmark.py ADDED Viewed

	@@ -0,0 +1,820 @@

+"""
+Query Normalization Benchmark
+==============================
+Benchmarks multiple normalization approaches on the generated dataset.
+Normalizers:
+  1. Identity              - baseline, no change
+  2. PySpellChecker        - token-by-token spell correction (current approach)
+  3. SymSpell              - faster, supports compound word correction
+  4. Rules                 - regex + entity canonicalization (flight IDs, stock tickers, product spacing)
+  5. RapidFuzz             - fuzzy brand name matching
+  6. Combined              - Rules → SymSpell → RapidFuzz pipeline
+  7. GuardedPySpell        - PySpellChecker that skips tokens ≤ 4 chars and ALL-CAPS tokens
+  8. CombinedV2            - Rules → RapidFuzz (single-token) → SymSpell split → GuardedPySpell → RapidFuzz
+  --- ML ---
+  9. ContextualSpellCheck  - spaCy pipeline with BERT contextual embeddings
+  10. T5SpellCorrector     - HuggingFace T5 fine-tuned for spelling correction
+  11. CombinedML           - Rules → T5 pipeline (entity rules first, T5 for the rest)
+Metrics (per normalizer, per category):
+  exact_match         - % where output == canonical (case-insensitive)
+  cer                 - character error rate: edit_dist / max(len_pred, len_gold)
+  wer                 - word error rate: token-level edit distance / n_gold_tokens
+  no_change_precision - on no_change rows: % correctly left unchanged
+  over_correction     - on no_change rows: % wrongly changed
+  latency_mean_ms     - mean per-query latency
+  latency_p50_ms      - p50 latency
+  latency_p95_ms      - p95 latency
+  latency_p99_ms      - p99 latency
+Usage:
+  pip install -r requirements.txt
+  python3 benchmark.py [--dataset dataset.csv]
+"""
+import re
+import sys
+import time
+import argparse
+import warnings
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from abc import ABC, abstractmethod
+from typing import Optional
+warnings.filterwarnings("ignore")
+# ── Optional imports ───────────────────────────────────────────────────────────
+try:
+    from Levenshtein import distance as _lev
+    def edit_distance(a: str, b: str) -> int: return _lev(a, b)
+except ImportError:
+    # Pure-python fallback
+    def edit_distance(a: str, b: str) -> int:
+        m, n = len(a), len(b)
+        dp = list(range(n + 1))
+        for i in range(1, m + 1):
+            prev = dp[:]
+            dp[0] = i
+            for j in range(1, n + 1):
+                dp[j] = prev[j - 1] if a[i-1] == b[j-1] else 1 + min(prev[j], dp[j-1], prev[j-1])
+        return dp[n]
+try:
+    from spellchecker import SpellChecker as _SC
+    HAS_PYSPELL = True
+except ImportError:
+    HAS_PYSPELL = False
+    print("Warning: pyspellchecker not installed — skipping PySpell normalizer")
+try:
+    from symspellpy import SymSpell as _SS, Verbosity as _V
+    import pkg_resources
+    HAS_SYMSPELL = True
+except ImportError:
+    HAS_SYMSPELL = False
+    print("Warning: symspellpy not installed — skipping SymSpell normalizer")
+try:
+    from rapidfuzz import process as _rf_process, fuzz as _rf_fuzz
+    HAS_RAPIDFUZZ = True
+except ImportError:
+    HAS_RAPIDFUZZ = False
+    print("Warning: rapidfuzz not installed — skipping RapidFuzz normalizer")
+try:
+    import spacy as _spacy
+    import contextualSpellCheck as _csc
+    _csc_nlp = _spacy.load("en_core_web_sm")
+    _csc.add_to_pipe(_csc_nlp)
+    HAS_CONTEXTUAL = True
+except Exception:
+    HAS_CONTEXTUAL = False
+    print("Warning: contextualSpellCheck/spacy not available — skipping ContextualSpellCheck normalizer")
+    print("  Install: pip install contextualSpellCheck && python -m spacy download en_core_web_sm")
+try:
+    from transformers import pipeline as _hf_pipeline
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
+    print("Warning: transformers not installed — skipping T5 normalizer")
+    print("  Install: pip install transformers torch")
+# ── Brand list for fuzzy matching ──────────────────────────────────────────────
+BRANDS = [
+    "amazon", "google", "facebook", "twitter", "instagram", "youtube",
+    "linkedin", "reddit", "netflix", "spotify", "microsoft", "adobe",
+    "dropbox", "github", "slack", "zoom", "paypal", "ebay", "walmart",
+    "target", "best buy", "new york times", "bbc", "cnn", "espn",
+    "gmail", "outlook", "yahoo", "apple", "samsung", "dell", "hp",
+    "lenovo", "asus", "acer", "toshiba", "sony", "lg", "panasonic",
+    "booking.com", "expedia", "airbnb", "tripadvisor", "yelp",
+    "doordash", "ubereats", "grubhub", "lyft", "uber",
+    "twitch", "discord", "telegram", "whatsapp", "snapchat", "tiktok",
+]
+# ── Entity lists for rules normalizer ──────────────────────────────────────────
+# Common IATA codes (2-3 letter airline codes)
+IATA_CODES = {
+    "AA", "BA", "DL", "UA", "LH", "AF", "EK", "QR", "SQ", "CX",
+    "VS", "KL", "IB", "TK", "AC", "QF", "NH", "JL", "MH", "TG",
+    "AI", "SA", "ET", "KE", "OZ", "CI", "BR", "LA", "AV", "AM",
+    "WN", "B6", "AS", "F9", "NK", "G4", "VX", "HA",
+}
+# Common stock tickers → company name aliases
+STOCK_ALIASES: dict[str, list[str]] = {
+    "AAPL": ["apple", "aapl"],
+    "TSLA": ["tesla", "tsla"],
+    "MSFT": ["microsoft", "msft"],
+    "GOOGL": ["google", "alphabet", "googl"],
+    "AMZN": ["amazon", "amzn"],
+    "META": ["meta", "facebook", "fb"],
+    "NVDA": ["nvidia", "nvda"],
+    "NFLX": ["netflix", "nflx"],
+    "PYPL": ["paypal", "pypl"],
+    "SNAP": ["snapchat", "snap"],
+    "AMD":  ["amd"],
+    "INTC": ["intel", "intc"],
+    "UBER": ["uber"],
+    "LYFT": ["lyft"],
+    "ABNB": ["airbnb", "abnb"],
+    "COIN": ["coinbase", "coin"],
+    "HOOD": ["robinhood", "hood"],
+}
+# Reverse map: alias → ticker
+# Built once at import time. Keys are lower-cased aliases; if the same alias is
+# listed under two tickers, the ticker that appears later in STOCK_ALIASES wins
+# because the assignment simply overwrites the earlier entry.
+_ALIAS_TO_TICKER: dict[str, str] = {}
+for ticker, aliases in STOCK_ALIASES.items():
+    for alias in aliases:
+        _ALIAS_TO_TICKER[alias.lower()] = ticker
+# Product model patterns: brand → canonical prefix
+PRODUCT_BRANDS = ["iphone", "samsung", "macbook", "ipad", "pixel", "surface"]
+# ── Base normalizer ────────────────────────────────────────────────────────────
+class Normalizer(ABC):
+    name: str
+    def warmup(self) -> None:
+        """Called once before benchmarking to initialize any lazy state."""
+        pass
+    @abstractmethod
+    def normalize(self, query: str) -> str:
+        ...
+    def normalize_batch(self, queries: list[str]) -> list[str]:
+        return [self.normalize(q) for q in queries]
+# ── 1. Identity (baseline) ────────────────────────────────────────────────────
+class IdentityNormalizer(Normalizer):
+    """Baseline normalizer that returns every query untouched."""
+    name = "Identity (baseline)"
+    def normalize(self, query: str) -> str:
+        """Return *query* exactly as received."""
+        return query
+# ── 2. PySpellChecker ────────────────────────────────────────────────────────
+class PySpellNormalizer(Normalizer):
+    name = "PySpellChecker"
+    def __init__(self):
+        if not HAS_PYSPELL:
+            raise RuntimeError("pyspellchecker not installed")
+        self._sc = _SC()
+    def normalize(self, query: str) -> str:
+        words = query.lower().split()
+        return " ".join(self._sc.correction(w) or w for w in words)
+# ── 3. SymSpell ───────────────────────────────────────────────────────────────
+_ORCAS_VOCAB = Path(__file__).parent / "orcas_vocab.txt"
+class SymSpellNormalizer(Normalizer):
+    name = "SymSpell"
+    def __init__(self, max_edit_distance: int = 2):
+        if not HAS_SYMSPELL:
+            raise RuntimeError("symspellpy not installed")
+        self._sym = _SS(max_dictionary_edit_distance=max_edit_distance)
+        # Try importlib.resources first (works in newer Python/packaging setups),
+        # fall back to pkg_resources for older environments.
+        _dict_loaded = False
+        # Try candidate dictionary filenames (name changed across symspellpy versions)
+        _DICT_CANDIDATES = ["frequency_dictionary_en_82_765.txt", "en-80k.txt"]
+        try:
+            import importlib.resources as _ir
+            for _fname in _DICT_CANDIDATES:
+                try:
+                    _ref = _ir.files("symspellpy").joinpath(_fname)
+                    with _ir.as_file(_ref) as _dp:
+                        _dict_loaded = self._sym.load_dictionary(str(_dp), term_index=0, count_index=1)
+                    if _dict_loaded:
+                        break
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        if not _dict_loaded:
+            for _fname in _DICT_CANDIDATES:
+                _dp = pkg_resources.resource_filename("symspellpy", _fname)
+                _dict_loaded = self._sym.load_dictionary(_dp, term_index=0, count_index=1)
+                if _dict_loaded:
+                    break
+        if _ORCAS_VOCAB.exists():
+            self._sym.load_dictionary(str(_ORCAS_VOCAB), term_index=0, count_index=1)
+            self.name = "SymSpell+ORCAS"
+        self._max_ed = max_edit_distance
+    def normalize(self, query: str) -> str:
+        # Use lookup_compound for multi-token correction
+        suggestions = self._sym.lookup_compound(
+            query.lower(), max_edit_distance=self._max_ed
+        )
+        if suggestions:
+            return suggestions[0].term
+        return query.lower()
+# ── 4. Rules (entity + regex) ────────────────────────────────────────────────
+class RulesNormalizer(Normalizer):
+    name = "Rules (entity + regex)"
+    # Flight: digits + IATA  or  IATA + digits  →  IATA + digits (no space)
+    _FLIGHT_LOOSE = re.compile(
+        r'\b(?:flight\s+)?(\d{2,4})\s*([A-Z]{2,3})\b'   # 163 SQ
+        r'|'
+        r'\b(?:flight\s+)?([A-Z]{2,3})\s+(\d{2,4})\b',  # SQ 163  (space)
+        re.IGNORECASE
+    )
+    # Product spacing: brand directly followed by digits/variant ("iphone15")
+    _PRODUCT_SPACING = re.compile(
+        r'\b(iphone|macbook|ipad|pixel|galaxy|surface|airpods)'
+        r'(\d+|pro|air|mini|max|ultra|plus)\b',
+        re.IGNORECASE
+    )
+    # Stock: remove surrounding noise, keep just the ticker
+    _STOCK_NOISE = re.compile(
+        r'\b(stock|share|price|shares|equity|ticker|market|trading|invest(?:ment)?)\b',
+        re.IGNORECASE
+    )
+    def _normalize_flight(self, query: str) -> str:
+        q_upper = query.upper()
+        def _repl(m):
+            if m.group(1):   # digits IATA
+                num, code = m.group(1), m.group(2).upper()
+            else:            # IATA digits
+                code, num = m.group(3).upper(), m.group(4)
+            if code in IATA_CODES:
+                return f"{code}{num}"
+            return m.group(0)
+        result = self._FLIGHT_LOOSE.sub(_repl, query)
+        return result
+    def _normalize_stock(self, query: str) -> Optional[str]:
+        ql = query.lower().strip()
+        tokens = ql.split()
+        # Check if any token is a known ticker or alias
+        found_ticker = None
+        for tok in tokens:
+            # Direct ticker match (uppercase)
+            if tok.upper() in STOCK_ALIASES:
+                found_ticker = tok.upper()
+                break
+            # Alias match
+            if tok in _ALIAS_TO_TICKER:
+                found_ticker = _ALIAS_TO_TICKER[tok]
+        if found_ticker:
+            # Case 1: stock noise words present (e.g. "AAPL stock price")
+            remaining = self._STOCK_NOISE.sub("", ql).strip()
+            if remaining != ql.strip():
+                return found_ticker
+            # Case 2: explicit ticker token present alongside alias
+            # (e.g. "apple aapl", "google GOOGL") — but NOT "google pixel 8"
+            if found_ticker.lower() in tokens:
+                return found_ticker
+        return None
+    def _normalize_product_spacing(self, query: str) -> str:
+        return self._PRODUCT_SPACING.sub(lambda m: f"{m.group(1)} {m.group(2)}", query)
+    def _normalize_word_order(self, query: str) -> str:
+        """Reorder product queries so the brand/product-line token comes first.
+        Handles patterns like:
+          's24 samsung'      → 'samsung s24'
+          'pro 14 macbook'   → 'macbook pro 14'
+          'ultra s23 samsung'→ 'samsung ultra s23'
+        """
+        tokens = query.lower().split()
+        if len(tokens) < 2:
+            return query
+        # Find a PRODUCT_BRANDS token that is not already at position 0
+        for i, tok in enumerate(tokens):
+            if i > 0 and tok in PRODUCT_BRANDS:
+                # Move brand to front, preserve relative order of the rest
+                return " ".join([tok] + tokens[:i] + tokens[i + 1:])
+        return query
+    def normalize(self, query: str) -> str:
+        q = query.strip()
+        # 1. Stock canonicalization
+        stock = self._normalize_stock(q)
+        if stock:
+            return stock
+        # 2. Flight ID normalization
+        q = self._normalize_flight(q)
+        # 3. Product spacing
+        q = self._normalize_product_spacing(q)
+        # 4. Product word order
+        q = self._normalize_word_order(q)
+        # 5. Clean up extra whitespace
+        q = re.sub(r'\s+', ' ', q).strip()
+        return q
+# ── 5. RapidFuzz (brand matching) ────────────────────────────────────────────
+class RapidFuzzNormalizer(Normalizer):
+    name = "RapidFuzz (brand match)"
+    def __init__(self, score_cutoff: int = 82):
+        if not HAS_RAPIDFUZZ:
+            raise RuntimeError("rapidfuzz not installed")
+        self._cutoff = score_cutoff
+    def normalize(self, query: str) -> str:
+        ql = query.lower().strip()
+        # Only attempt brand correction on short queries (≤ 3 tokens)
+        tokens = ql.split()
+        if len(tokens) > 3:
+            return query
+        # Skip very short queries — too ambiguous to fuzzy-match safely
+        # (e.g. 'appl', 'npm', 'gcc' should not be matched to brand names)
+        if len(ql) <= 5:
+            return query
+        # Try matching each n-gram of the query against the brand list
+        # First try the full query, then try progressively smaller windows
+        result = _rf_process.extractOne(
+            ql, BRANDS,
+            scorer=_rf_fuzz.token_sort_ratio,
+            score_cutoff=self._cutoff,
+        )
+        if result:
+            best_match, score, _ = result
+            return best_match
+        return query
+# ── 6. Combined ───────────────────────────────────────────────────────────────
+class CombinedNormalizer(Normalizer):
+    name = "Combined (Rules + SymSpell + RapidFuzz)"
+    def __init__(self):
+        self._rules = RulesNormalizer()
+        self._symspell = SymSpellNormalizer() if HAS_SYMSPELL else None
+        self._rfuzz   = RapidFuzzNormalizer() if HAS_RAPIDFUZZ else None
+    def normalize(self, query: str) -> str:
+        q = query.strip()
+        # Step 1: Apply entity/structural rules first (highest precision)
+        q_rules = self._rules.normalize(q)
+        if q_rules.lower() != q.lower():
+            return q_rules  # Rules made a change — trust it
+        # Step 2: SymSpell for general typo correction
+        if self._symspell:
+            q_sym = self._symspell.normalize(q)
+            if q_sym.lower() != q.lower():
+                return q_sym
+        # Step 3: RapidFuzz for brand name typos (catches what SymSpell misses
+        # on compound brand names like "bestbuyt" → "best buy")
+        if self._rfuzz:
+            q_rf = self._rfuzz.normalize(q)
+            if q_rf.lower() != q.lower():
+                return q_rf
+        return q
+# ── 7. GuardedPySpell ────────────────────────────────────────────────────────
+class GuardedPySpellNormalizer(Normalizer):
+    """PySpellChecker with guards to prevent over-correction.
+    PySpellChecker gets 88% on single_typo and 71% on multi_typo, but has
+    40% over-correction on no-change queries (e.g. 'appl' → 'apple').
+    Guards:
+      - Skip tokens ≤ 4 chars  (appl, npm, gcc, css, java, rust, echo, go)
+      - Skip all-uppercase tokens  (AAPL, NYC, SQ — abbreviations/tickers)
+    Most legitimate short abbreviations are ≤ 4 chars or all-caps.
+    Typos worth correcting are almost always ≥ 5 chars ('wheather', 'suhsi').
+    """
+    name = "PySpell (guarded)"
+    def __init__(self):
+        if not HAS_PYSPELL:
+            raise RuntimeError("pyspellchecker not installed")
+        self._sc = _SC()
+    def _skip(self, token: str) -> bool:
+        return len(token) <= 4 or token.isupper()
+    def normalize(self, query: str) -> str:
+        words = query.lower().split()
+        return " ".join(
+            w if self._skip(w) else (self._sc.correction(w) or w)
+            for w in words
+        )
+# ── 8. CombinedV2 (Rules + GuardedPySpell + RapidFuzz) ───────────────────────
+class CombinedV2Normalizer(Normalizer):
+    """Improved pipeline: Rules → RapidFuzz (single-token) → SymSpell split → GuardedPySpell → RapidFuzz (multi-token).
+    Rules handles structured entities (flight IDs, stock tickers, product
+    spacing/order) first. RapidFuzz then runs on single-token
+    queries to catch brand typos (bestbuyt→best buy) before SymSpell can corrupt
+    them (bestbuyt→best but). SymSpell compound splitting then handles concatenated
+    words (nearme→near me). GuardedPySpell handles general typos while protecting
+    short tokens. RapidFuzz runs again at the end as a last resort.
+    Every stage receives the original stripped query, and the first stage
+    that produces an accepted change returns immediately. Stages whose
+    optional dependency is missing are skipped.
+    """
+    name = "CombinedV2 (Rules + GuardedPySpell + RapidFuzz)"
+    def __init__(self):
+        self._rules    = RulesNormalizer()
+        self._symspell = SymSpellNormalizer() if HAS_SYMSPELL else None
+        self._pyspell  = GuardedPySpellNormalizer() if HAS_PYSPELL else None
+        self._rfuzz    = RapidFuzzNormalizer() if HAS_RAPIDFUZZ else None
+    def normalize(self, query: str) -> str:
+        """Return the output of the first stage that changes *query*, else the stripped query."""
+        q = query.strip()
+        # Step 1: Rules — flight IDs, stock tickers, product spacing/order
+        q_rules = self._rules.normalize(q)
+        if q_rules.lower() != q.lower():
+            return q_rules
+        # Step 2: RapidFuzz — brand name typos for single-token queries.
+        # Must run before SymSpell compound splitting: SymSpell splits 'bestbuyt'
+        # into 'best but' (wrong) whereas RapidFuzz correctly maps it to 'best buy'.
+        if self._rfuzz and ' ' not in q:
+            q_rf = self._rfuzz.normalize(q)
+            if q_rf.lower() != q.lower():
+                return q_rf
+        # Step 3: SymSpell compound splitting for single-token queries only.
+        # GuardedPySpell would corrupt 'nearme'→'name', 'newyork'→'network'.
+        # Only accept the SymSpell result if it actually introduces a space
+        # (i.e. it split the word rather than substituting a different word).
+        if self._symspell and ' ' not in q:
+            q_sym = self._symspell.normalize(q)
+            if ' ' in q_sym:
+                return q_sym
+        # Step 4: GuardedPySpell — general typos (skips short/uppercase tokens)
+        if self._pyspell:
+            q_spell = self._pyspell.normalize(q)
+            if q_spell.lower() != q.lower():
+                return q_spell
+        # Step 5: RapidFuzz — last-resort brand match, now also reached by
+        # multi-token queries.
+        # NOTE(review): RapidFuzzNormalizer.normalize returns the matched brand
+        # string itself, so a hit here replaces the whole query with the brand
+        # (it does not produce e.g. 'gooogle maps' → 'google maps') — confirm
+        # whether token-level replacement was intended.
+        if self._rfuzz:
+            q_rf = self._rfuzz.normalize(q)
+            if q_rf.lower() != q.lower():
+                return q_rf
+        return q
+# ── 9. ContextualSpellCheck (spaCy + BERT) ───────────────────────────────────
+class ContextualSpellCheckNormalizer(Normalizer):
+    """Uses BERT contextual embeddings to decide whether and how to correct
+    each token. Unlike SymSpell, it sees the full query context before
+    making a correction — so 'appl' in an ambiguous context stays as-is,
+    while 'wheather nyc' correctly becomes 'weather nyc'.
+    Requires:
+      pip install contextualSpellCheck
+      python -m spacy download en_core_web_sm
+    """
+    name = "ContextualSpellCheck (BERT)"
+    def __init__(self):
+        if not HAS_CONTEXTUAL:
+            raise RuntimeError("contextualSpellCheck not available")
+        self._nlp = _csc_nlp
+    def normalize(self, query: str) -> str:
+        doc = self._nlp(query)
+        # doc._.outcome_spellCheck is the full corrected string
+        result = doc._.outcome_spellCheck
+        return result if result else query
+# ── 8. T5 Spell Corrector (HuggingFace) ──────────────────────────────────────
+class T5SpellCorrector(Normalizer):
+    """Fine-tuned T5 model for spelling correction.
+    Model: oliverguhr/spelling-correction-english-base
+    This is a seq2seq model trained on noisy→clean sentence pairs.
+    It handles multi-token typos, word order, and spacing better than
+    dictionary-based approaches, but at significantly higher latency.
+    Expected latency: ~100–500ms on CPU, ~20–80ms on GPU.
+    Requires:
+      pip install transformers torch (or transformers sentencepiece)
+    """
+    name = "T5 (oliverguhr/spelling-correction)"
+    _MODEL_ID = "oliverguhr/spelling-correction-english-base"
+    def __init__(self):
+        if not HAS_TRANSFORMERS:
+            raise RuntimeError("transformers not installed")
+        self._pipe = None  # lazy load in warmup()
+    def warmup(self) -> None:
+        print(f"    Loading {self._MODEL_ID}...", end=" ", flush=True)
+        self._pipe = _hf_pipeline(
+            "text2text-generation",
+            model=self._MODEL_ID,
+            tokenizer=self._MODEL_ID,
+        )
+        # Prime the model with a dummy query
+        self._pipe("warmup query", max_length=64)
+        print("ready")
+    def normalize(self, query: str) -> str:
+        if self._pipe is None:
+            self.warmup()
+        result = self._pipe(query, max_length=128, num_beams=4)
+        return result[0]["generated_text"].strip()
+# ── 9. CombinedML (Rules → T5) ───────────────────────────────────────────────
+class CombinedMLNormalizer(Normalizer):
+    """Best-of-both-worlds pipeline:
+      1. Rules handle structured entity normalization (flight IDs, stock tickers,
+         product model reordering) with zero latency and perfect precision.
+      2. T5 handles everything else — general typos, multi-token corrections,
+         brand names — using full-query context.
+    This avoids running T5 on queries that rules already handle perfectly,
+    saving latency on the most common structured patterns.
+    """
+    name = "CombinedML (Rules → T5)"
+    def __init__(self):
+        self._rules = RulesNormalizer()
+        self._t5    = T5SpellCorrector() if HAS_TRANSFORMERS else None
+    def warmup(self) -> None:
+        if self._t5:
+            self._t5.warmup()
+    def normalize(self, query: str) -> str:
+        # Step 1: Rules first — highest precision for structured entities
+        q_rules = self._rules.normalize(query)
+        if q_rules.lower() != query.lower():
+            return q_rules
+        # Step 2: T5 for everything else
+        if self._t5:
+            return self._t5.normalize(query)
+        return query
+# ── Metrics ───────────────────────────────────────────────────────────────────
+def char_error_rate(pred: str, gold: str) -> float:
+    """CER = edit_distance / max(len(pred), len(gold))."""
+    if not pred and not gold:
+        return 0.0
+    return edit_distance(pred.lower(), gold.lower()) / max(len(pred), len(gold))
+def word_error_rate(pred: str, gold: str) -> float:
+    """WER = token-level edit distance / number of gold tokens."""
+    pred_toks = pred.lower().split()
+    gold_toks = gold.lower().split()
+    if not gold_toks:
+        return 0.0
+    m, n = len(pred_toks), len(gold_toks)
+    dp = list(range(n + 1))
+    for i in range(1, m + 1):
+        prev = dp[:]
+        dp[0] = i
+        for j in range(1, n + 1):
+            dp[j] = prev[j-1] if pred_toks[i-1] == gold_toks[j-1] \
+                    else 1 + min(prev[j], dp[j-1], prev[j-1])
+    return dp[n] / n
+def run_benchmark(normalizer: Normalizer, df: pd.DataFrame, n_timing_reps: int = 5) -> dict:
+    """Run a normalizer on the dataset and return metrics."""
+    queries = df["noisy"].tolist()
+    # ── Timing ───────────────────────────────────────────────────────────────
+    latencies_ms = []
+    for q in queries:
+        t0 = time.perf_counter()
+        for _ in range(n_timing_reps):
+            normalizer.normalize(q)
+        t1 = time.perf_counter()
+        latencies_ms.append((t1 - t0) / n_timing_reps * 1000)
+    # ── Predictions ──────────────────────────────────────────────────────────
+    preds = [normalizer.normalize(q) for q in queries]
+    df = df.copy()
+    df["pred"] = preds
+    def em(row): return row["pred"].lower().strip() == row["canonical"].lower().strip()
+    def cer(row): return char_error_rate(row["pred"], row["canonical"])
+    def wer(row): return word_error_rate(row["pred"], row["canonical"])
+    df["em"]  = df.apply(em, axis=1)
+    df["cer"] = df.apply(cer, axis=1)
+    df["wer"] = df.apply(wer, axis=1)
+    # No-change precision and over-correction rate
+    nc = df[~df["should_change"]]
+    no_change_precision = (nc["pred"].str.lower().str.strip() == nc["noisy"].str.lower().str.strip()).mean() if len(nc) else float("nan")
+    over_correction     = 1.0 - no_change_precision if not np.isnan(no_change_precision) else float("nan")
+    # ── Per-category exact match ──────────────────────────────────────────────
+    cat_em = df.groupby("category")["em"].mean().to_dict()
+    return {
+        "name":                normalizer.name,
+        "exact_match":         df["em"].mean(),
+        "cer_mean":            df["cer"].mean(),
+        "wer_mean":            df["wer"].mean(),
+        "no_change_precision": no_change_precision,
+        "over_correction":     over_correction,
+        "latency_mean_ms":     np.mean(latencies_ms),
+        "latency_p50_ms":      np.percentile(latencies_ms, 50),
+        "latency_p95_ms":      np.percentile(latencies_ms, 95),
+        "latency_p99_ms":      np.percentile(latencies_ms, 99),
+        "per_category":        cat_em,
+        "_df":                 df,      # store for detailed output
+        "_latencies":          latencies_ms,
+    }
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", default=str(Path(__file__).parent / "dataset.csv"))
+    parser.add_argument("--reps",    type=int, default=5, help="Timing repetitions per query")
+    args = parser.parse_args()
+    df = pd.read_csv(args.dataset)
+    print(f"Loaded {len(df)} rows from {args.dataset}")
+    print(f"Categories: {df['category'].value_counts().to_dict()}\n")
+    # ── Build normalizer list ─────────────────────────────────────────────────
+    normalizers: list[Normalizer] = [IdentityNormalizer(), RulesNormalizer()]
+    if HAS_PYSPELL:
+        normalizers.append(PySpellNormalizer())
+    if HAS_SYMSPELL:
+        normalizers.append(SymSpellNormalizer())
+    if HAS_RAPIDFUZZ:
+        normalizers.append(RapidFuzzNormalizer())
+    if HAS_SYMSPELL and HAS_RAPIDFUZZ:
+        normalizers.append(CombinedNormalizer())
+    if HAS_PYSPELL:
+        normalizers.append(GuardedPySpellNormalizer())
+    if HAS_PYSPELL and HAS_RAPIDFUZZ:
+        normalizers.append(CombinedV2Normalizer())
+    # ML normalizers (disabled — too slow and underperform rules-based)
+    # if HAS_CONTEXTUAL:
+    #     normalizers.append(ContextualSpellCheckNormalizer())
+    # if HAS_TRANSFORMERS:
+    #     normalizers.append(T5SpellCorrector())
+    #     normalizers.append(CombinedMLNormalizer())
+    # Warmup
+    for norm in normalizers:
+        norm.warmup()
+    # ── Run benchmarks ────────────────────────────────────────────────────────
+    results = []
+    for norm in normalizers:
+        print(f"Benchmarking: {norm.name}...", end=" ", flush=True)
+        r = run_benchmark(norm, df, n_timing_reps=args.reps)
+        results.append(r)
+        print(f"EM={r['exact_match']:.1%}  CER={r['cer_mean']:.3f}  lat_p50={r['latency_p50_ms']:.2f}ms")
+    # ── Summary table ─────────────────────────────────────────────────────────
+    print("\n" + "="*90)
+    print("SUMMARY — Overall Metrics")
+    print("="*90)
+    summary_rows = []
+    for r in results:
+        summary_rows.append({
+            "Normalizer":         r["name"],
+            "Exact Match":        f"{r['exact_match']:.1%}",
+            "CER":                f"{r['cer_mean']:.3f}",
+            "WER":                f"{r['wer_mean']:.3f}",
+            "No-change Prec.":    f"{r['no_change_precision']:.1%}" if not np.isnan(r['no_change_precision']) else "N/A",
+            "Over-correction":    f"{r['over_correction']:.1%}"     if not np.isnan(r['over_correction'])     else "N/A",
+            "Lat mean (ms)":      f"{r['latency_mean_ms']:.2f}",
+            "Lat p50 (ms)":       f"{r['latency_p50_ms']:.2f}",
+            "Lat p95 (ms)":       f"{r['latency_p95_ms']:.2f}",
+            "Lat p99 (ms)":       f"{r['latency_p99_ms']:.2f}",
+        })
+    try:
+        from tabulate import tabulate
+        print(tabulate(summary_rows, headers="keys", tablefmt="rounded_outline"))
+    except ImportError:
+        pd.DataFrame(summary_rows).to_string(index=False)
+        print(pd.DataFrame(summary_rows).to_string(index=False))
+    # ── Per-category table ────────────────────────────────────────────────────
+    categories = sorted(df["category"].unique())
+    print("\n" + "="*90)
+    print("PER-CATEGORY Exact Match")
+    print("="*90)
+    cat_rows = []
+    for r in results:
+        row = {"Normalizer": r["name"][:30]}
+        for cat in categories:
+            row[cat] = f"{r['per_category'].get(cat, float('nan')):.0%}"
+        cat_rows.append(row)
+    try:
+        from tabulate import tabulate
+        print(tabulate(cat_rows, headers="keys", tablefmt="rounded_outline"))
+    except ImportError:
+        print(pd.DataFrame(cat_rows).to_string(index=False))
+    # ── Sample predictions ────────────────────────────────────────────────────
+    print("\n" + "="*90)
+    print("SAMPLE PREDICTIONS — Combined vs Identity (first 5 per category)")
+    print("="*90)
+    combined_r = next((r for r in results if "CombinedV2" in r["name"]),
+                      next((r for r in results if "Combined" in r["name"]), results[-1]))
+    identity_r = results[0]
+    for cat in categories:
+        sub   = combined_r["_df"][combined_r["_df"]["category"] == cat].head(5)
+        id_sub = identity_r["_df"][identity_r["_df"]["category"] == cat].head(5)
+        print(f"\n  {cat.upper()}")
+        print(f"  {'Noisy':<30} {'Canonical':<25} {'Combined pred':<25} {'EM':>4}")
+        print(f"  {'-'*30} {'-'*25} {'-'*25} {'-'*4}")
+        for (_, row), (_, id_row) in zip(sub.iterrows(), id_sub.iterrows()):
+            em_mark = "✓" if row["em"] else "✗"
+            print(f"  {row['noisy']:<30} {row['canonical']:<25} {row['pred']:<25} {em_mark:>4}")
+    # ── Save full results ─────────────────────────────────────────────────────
+    out_path = Path(args.dataset).parent / "results.csv"
+    combined_r["_df"].to_csv(out_path, index=False)
+    print(f"\nFull predictions saved to {out_path}")
+# Script entry point: run the benchmark only when this file is executed
+# directly, so importing the module does not start a benchmark run.
+if __name__ == "__main__":
+    main()

dataset.csv ADDED Viewed

	@@ -0,0 +1,300 @@

+noisy,canonical,category,should_change
+wheather nyc,weather nyc,single_typo,True
+calclator,calculator,single_typo,True
+forcast london,forecast london,single_typo,True
+temprature converter,temperature converter,single_typo,True
+restaurent near me,restaurant near me,single_typo,True
+translater english,translator english,single_typo,True
+defintion of entropy,definition of entropy,single_typo,True
+seperate the words,separate the words,single_typo,True
+accomodation paris,accommodation paris,single_typo,True
+recieve email,receive email,single_typo,True
+suhsi near me,sushi near me,multi_typo,True
+restarant near me,restaurant near me,multi_typo,True
+wether forcast today,weather forecast today,multi_typo,True
+plmber emergancy,plumber emergency,multi_typo,True
+nearist cofee shop,nearest coffee shop,multi_typo,True
+cheep flihgts paris,cheap flights paris,multi_typo,True
+hotl delas nyc,hotel deals nyc,multi_typo,True
+hosptial emergancy rm,hospital emergency room,multi_typo,True
+bestbuyt,best buy,brand_typo,True
+youtueb,youtube,brand_typo,True
+gooogle maps,google maps,brand_typo,True
+amazom prime,amazon prime,brand_typo,True
+netflx login,netflix login,brand_typo,True
+spotifiy premium,spotify premium,brand_typo,True
+facbook login,facebook login,brand_typo,True
+instagrem,instagram,brand_typo,True
+linkdin profile,linkedin profile,brand_typo,True
+gitub repo,github repo,brand_typo,True
+163 SQ,SQ163,flight_order,True
+100 AA,AA100,flight_order,True
+417 BA,BA417,flight_order,True
+SQ 163,SQ163,flight_order,True
+AA 100,AA100,flight_order,True
+815 DL,DL815,flight_order,True
+200 UA,UA200,flight_order,True
+flight 163 SQ,SQ163,flight_order,True
+AA flight 100,AA100,flight_order,True
+15 iphone,iphone 15,product_order,True
+pro 14 macbook,macbook pro 14,product_order,True
+s24 samsung,samsung s24,product_order,True
+ultra s23 samsung,samsung s23 ultra,product_order,True
+air 13 macbook,macbook air 13,product_order,True
+pro ipad 12,ipad pro 12,product_order,True
+max pro 15 iphone,iphone 15 pro max,product_order,True
+pixel 8 google,google pixel 8,product_order,True
+tab s9 samsung,samsung tab s9,product_order,True
+AAPL stock,AAPL,stock_canon,True
+stock TSLA,TSLA,stock_canon,True
+apple aapl,AAPL,stock_canon,True
+tesla stock price,TSLA,stock_canon,True
+MSFT share price,MSFT,stock_canon,True
+google stock GOOGL,GOOGL,stock_canon,True
+amazon AMZN stock,AMZN,stock_canon,True
+meta stock FB,META,stock_canon,True
+nvda share price,NVDA,stock_canon,True
+iphone15,iphone 15,spacing,True
+macbookpro,macbook pro,spacing,True
+nearme,near me,spacing,True
+bestbuy,best buy,spacing,True
+newyork,new york,spacing,True
+unitedstates,united states,spacing,True
+wifi password,wifi password,spacing,False
+hotdog,hotdog,spacing,False
+appl,appl,no_change,False
+rust,rust,no_change,False
+delta,delta,no_change,False
+apple,apple,no_change,False
+python,python,no_change,False
+java,java,no_change,False
+echo,echo,no_change,False
+spring,spring,no_change,False
+cloud,cloud,no_change,False
+mercury,mercury,no_change,False
+npm,npm,no_change,False
+gcc,gcc,no_change,False
+css,css,no_change,False
+go,go,no_change,False
+swift,swift,no_change,False
+waether forecast tomorrow,weather forecast tomorrow,single_typo,True
+best pizzeria near me,best pizzeria near me,single_typo,False
+how to cook pasta,how to cook pasta,single_typo,False
+gas stations nearby,gas stations nearby,single_typo,False
+resturant reservations online,restaurant reservations online,single_typo,True
+puplic libraries near me,public libraries near me,single_typo,True
+best plumber in my area,best plumber in my area,single_typo,False
+forcast weekend weather,forecast weekend weather,single_typo,True
+how to pronounce worcester,how to pronounce worcester,single_typo,False
+recipie for chocolate cake,recipe for chocolate cake,single_typo,True
+hardware store locator,hardware store locator,single_typo,False
+trafic conditions now,traffic conditions now,single_typo,True
+vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False
+how to spell occassion,how to spell occasion,single_typo,True
+dentist appointements available,dentist appointments available,single_typo,True
+nearest pharmacy open now,nearest pharmacy open now,single_typo,False
+beginner gardening tips,beginner gardening tips,single_typo,False
+calorie counter app,calorie counter app,single_typo,False
+what is the defintion of serendipity,what is the definition of serendipity,single_typo,True
+best electrician in town,best electrician in town,single_typo,False
+humidty levels today,humidity levels today,single_typo,True
+directions to airport,directions to airport,single_typo,False
+how to spel definitely,how to spell definitely,single_typo,True
+grocery stores near me,grocery stores near me,single_typo,False
+buisness hours for target,business hours for target,single_typo,True
+weather in chicago tommorow,weather in chicago tomorrow,single_typo,True
+how to make omlette,how to make omelette,single_typo,True
+atm machine locations,atm machine locations,single_typo,False
+barber shop avaialble,barber shop available,single_typo,True
+best restauarant in boston,best restaurant in boston,single_typo,True
+how to cook brocoli,how to cook broccoli,single_typo,True
+swimming pools near me,swimming pools near me,single_typo,False
+seperate the documents,separate the documents,single_typo,True
+temperture in fahrenheit,temperature in fahrenheit,single_typo,True
+parking garage nearby,parking garage nearby,single_typo,False
+how to make lasagna recepie,how to make lasagna recipe,single_typo,True
+veterinary clinic hours,veterinary clinic hours,single_typo,False
+what does recieve mean,what does receive mean,single_typo,True
+yoga classes availible now,yoga classes available now,single_typo,True
+begininng spanish lessons,beginning spanish lessons,single_typo,True
+plumber near me emergancy,plumber near me emergency,multi_typo,True
+dentist appoitment availble,dentist appointment available,multi_typo,True
+electritian repaire servises,electrician repair services,multi_typo,True
+weather forcast this weakend,weather forecast this weekend,multi_typo,True
+neerest gass station,nearest gas station,multi_typo,True
+hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True
+autombile mechanic lokation,automobile mechanic location,multi_typo,True
+humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True
+locksmith emergancy servise,locksmith emergency service,multi_typo,True
+flight ticket prices comparision,flight ticket prices comparison,multi_typo,True
+restauant reservaton opem,restaurant reservation open,multi_typo,True
+carpentor contrator estimat,carpenter contractor estimate,multi_typo,True
+tempreture alert wheather,temperature alert weather,multi_typo,True
+tourist atraction guidebook,tourist attraction guidebook,multi_typo,True
+laundry servise neerby,laundry service nearby,multi_typo,True
+vehcile registation renewel,vehicle registration renewal,multi_typo,True
+snowstrom warning forcast,snowstorm warning forecast,multi_typo,True
+hostotel accomodation deals,hostel accommodation deals,multi_typo,True
+haircut appoitment schedul,haircut appointment schedule,multi_typo,True
+sunrise time locaton,sunrise time location,multi_typo,True
+moving compeny quoate,moving company quote,multi_typo,True
+road conidtion trafic update,road condition traffic update,multi_typo,True
+veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True
+vacation packge discunt availble,vacation package discount available,multi_typo,True
+pest contral servise lokation,pest control service location,multi_typo,True
+pollin forcast alergy,pollen forecast allergy,multi_typo,True
+airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True
+window cleening compny rates,window cleaning company rates,multi_typo,True
+wind gust wheather alert,wind gust weather alert,multi_typo,True
+rentral car comparision price,rental car comparison price,multi_typo,True
+goggle.com,google.com,brand_typo,True
+amazn.com,amazon.com,brand_typo,True
+spotfiy music,spotify music,brand_typo,True
+instgram app,instagram app,brand_typo,True
+gitub profile,github profile,brand_typo,True
+redditt.com,reddit.com,brand_typo,True
+twiter feed,twitter feed,brand_typo,True
+linkdin jobs,linkedin jobs,brand_typo,True
+microsodt office,microsoft office,brand_typo,True
+adoobe creative,adobe creative,brand_typo,True
+dropbx files,dropbox files,brand_typo,True
+zom meeting,zoom meeting,brand_typo,True
+slck workspace,slack workspace,brand_typo,True
+paypa checkout,paypal checkout,brand_typo,True
+ebya auction,ebay auction,brand_typo,True
+wallmart groceries,walmart groceries,brand_typo,True
+targat deals,target deals,brand_typo,True
+nytimez news,nytimes news,brand_typo,True
+bbc.co.uk,bbc.com,brand_typo,True
+cnn breaking,cnn breaking news,brand_typo,True
+youtub video,youtube video,brand_typo,True
+netflic series,netflix series,brand_typo,True
+googl drive,google drive,brand_typo,True
+amzon shopping,amazon shopping,brand_typo,True
+spotiffy playlist,spotify playlist,brand_typo,True
+facebk messenger,facebook messenger,brand_typo,True
+insta stories,instagram stories,brand_typo,True
+gihub code,github code,brand_typo,True
+reddot forum,reddit forum,brand_typo,True
+BA 287,BA287,flight_order,True
+502 DL,DL502,flight_order,True
+flight UA 441,UA441,flight_order,True
+lh 156,LH156,flight_order,True
+273 AF,AF273,flight_order,True
+EK  89,EK89,flight_order,True
+621qr,QR621,flight_order,True
+CX 884,CX884,flight_order,True
+345 vs,VS345,flight_order,True
+KL 714,KL714,flight_order,True
+193ib,IB193,flight_order,True
+TK 427,TK427,flight_order,True
+flight 556 AA,AA556,flight_order,True
+738 ba,BA738,flight_order,True
+DL 212,DL212,flight_order,True
+84ua,UA84,flight_order,True
+AF  609,AF609,flight_order,True
+445 ek,EK445,flight_order,True
+sq 267,SQ267,flight_order,True
+572 CX,CX572,flight_order,True
+flight vs314,VS314,flight_order,True
+981 KL,KL981,flight_order,True
+IB   456,IB456,flight_order,True
+tk 103,TK103,flight_order,True
+890 SQ,SQ890,flight_order,True
+13 air macbook,macbook air 13,product_order,True
+pro iphone 15,iphone 15 pro,product_order,True
+8 pixel google,google pixel 8,product_order,True
+ultra 24 s23 samsung,samsung s23 ultra,product_order,True
+fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True
+max 14 pro iphone,iphone 14 pro max,product_order,True
+16 macbook pro,macbook pro 16,product_order,True
+pixel pro 7 google,google pixel 7 pro,product_order,True
+tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True
+12 mini iphone,iphone 12 mini,product_order,True
+z fold 4 samsung,samsung galaxy z fold 4,product_order,True
+watch series 9 apple,apple watch series 9,product_order,True
+xl pixel 8 google,google pixel 8 xl,product_order,True
+s24 ultra samsung,samsung s24 ultra,product_order,True
+15 macbook air,macbook air 15,product_order,True
+iphone pro 13,iphone 13 pro,product_order,True
+flip 5 z samsung,samsung galaxy z flip 5,product_order,True
+7 series watch apple,apple watch series 7,product_order,True
+a15 oneplus,oneplus a15,product_order,True
+pad air 11 ipad,ipad air 11,product_order,True
+ultra 15 iphone pro,iphone 15 pro max,product_order,True
+note 24 galaxy samsung,samsung galaxy note 24,product_order,True
+11 pro max iphone,iphone 11 pro max,product_order,True
+studio display apple,apple studio display,product_order,True
+x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True
+AAPL stock price,AAPL,stock_canon,True
+tesla share price,TSLA,stock_canon,True
+MSFT earnings,MSFT,stock_canon,True
+google GOOGL stock,GOOGL,stock_canon,True
+AMZN share,AMZN,stock_canon,True
+amazon price AMZN,AMZN,stock_canon,True
+META stock price,META,stock_canon,True
+nvidia NVDA,NVDA,stock_canon,True
+NFLX share price,NFLX,stock_canon,True
+netflix stock NFLX,NFLX,stock_canon,True
+PYPL price,PYPL,stock_canon,True
+paypal PYPL stock,PYPL,stock_canon,True
+SNAP stock,SNAP,stock_canon,True
+snapchat SNAP,SNAP,stock_canon,True
+AMD share price,AMD,stock_canon,True
+amd processor stock,AMD,stock_canon,True
+INTC earnings,INTC,stock_canon,True
+intel INTC stock,INTC,stock_canon,True
+QCOM price,QCOM,stock_canon,True
+qualcomm QCOM,QCOM,stock_canon,True
+UBER stock price,UBER,stock_canon,True
+lyft LYFT share,LYFT,stock_canon,True
+airbnb ABNB stock,ABNB,stock_canon,True
+iphone15pro,iphone 15 pro,spacing,True
+samsungz9,samsung z9,spacing,True
+ipadair,ipad air,spacing,True
+losangeles,los angeles,spacing,True
+sanfrancisco,san francisco,spacing,True
+nearbyshops,nearby shops,spacing,True
+dellxps13,dell xps 13,spacing,True
+surfacelaptopp5,surface laptop p5,spacing,True
+newyorkpizza,new york pizza,spacing,True
+holmescompany,holmes company,spacing,True
+findme,find me,spacing,True
+bostonma,boston ma,spacing,True
+sanjose,san jose,spacing,True
+pixelwatch2,pixel watch 2,spacing,True
+openpizza,open pizza,spacing,True
+northcarolina,north carolina,spacing,True
+showevenear,show venues near,spacing,True
+galax30series,galax 30 series,spacing,True
+laptop,laptop,spacing,False
+smartphone,smartphone,spacing,False
+keyboard,keyboard,spacing,False
+monitor,monitor,spacing,False
+NYC,NYC,no_change,False
+LA,LA,no_change,False
+UK,UK,no_change,False
+vue,vue,no_change,False
+aws,aws,no_change,False
+sql,sql,no_change,False
+git,git,no_change,False
+c,c,no_change,False
+x,x,no_change,False
+r,r,no_change,False
+z,z,no_change,False
+kafka,kafka,no_change,False
+nginx,nginx,no_change,False
+vim,vim,no_change,False
+pdf,pdf,no_change,False
+xml,xml,no_change,False
+svg,svg,no_change,False
+gcp,gcp,no_change,False
+cli,cli,no_change,False
+api,api,no_change,False
+jwt,jwt,no_change,False
+mvp,mvp,no_change,False
+gdpr,gdpr,no_change,False
+crm,crm,no_change,False
+ux,ux,no_change,False
+pwa,pwa,no_change,False
+orm,orm,no_change,False

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
-altair
 pandas
-streamlit

+streamlit>=1.32.0
 pandas
+symspellpy
+rapidfuzz
+python-Levenshtein
+pyspellchecker

results.csv ADDED Viewed

	@@ -0,0 +1,300 @@

+noisy,canonical,category,should_change,pred,em,cer,wer
+wheather nyc,weather nyc,single_typo,True,whether nyc,False,0.18181818181818182,0.5
+calclator,calculator,single_typo,True,calculator,True,0.0,0.0
+forcast london,forecast london,single_typo,True,forecast london,True,0.0,0.0
+temprature converter,temperature converter,single_typo,True,temperature converter,True,0.0,0.0
+restaurent near me,restaurant near me,single_typo,True,restaurant near me,True,0.0,0.0
+translater english,translator english,single_typo,True,translate english,False,0.1111111111111111,0.5
+defintion of entropy,definition of entropy,single_typo,True,definition of entropy,True,0.0,0.0
+seperate the words,separate the words,single_typo,True,separate the words,True,0.0,0.0
+accomodation paris,accommodation paris,single_typo,True,accommodation paris,True,0.0,0.0
+recieve email,receive email,single_typo,True,receive email,True,0.0,0.0
+suhsi near me,sushi near me,multi_typo,True,sushi near me,True,0.0,0.0
+restarant near me,restaurant near me,multi_typo,True,restaurant near me,True,0.0,0.0
+wether forcast today,weather forecast today,multi_typo,True,wether forecast today,False,0.045454545454545456,0.3333333333333333
+plmber emergancy,plumber emergency,multi_typo,True,plumber emergency,True,0.0,0.0
+nearist cofee shop,nearest coffee shop,multi_typo,True,nearest coffee shop,True,0.0,0.0
+cheep flihgts paris,cheap flights paris,multi_typo,True,cheep flights paris,False,0.05263157894736842,0.3333333333333333
+hotl delas nyc,hotel deals nyc,multi_typo,True,hotl delay nyc,False,0.26666666666666666,0.6666666666666666
+hosptial emergancy rm,hospital emergency room,multi_typo,True,hospital emergency rm,False,0.08695652173913043,0.3333333333333333
+bestbuyt,best buy,brand_typo,True,best buy,True,0.0,0.0
+youtueb,youtube,brand_typo,True,youtube,True,0.0,0.0
+gooogle maps,google maps,brand_typo,True,google maps,True,0.0,0.0
+amazom prime,amazon prime,brand_typo,True,amazon prime,True,0.0,0.0
+netflx login,netflix login,brand_typo,True,netflix login,True,0.0,0.0
+spotifiy premium,spotify premium,brand_typo,True,spotifiy premium,False,0.0625,0.5
+facbook login,facebook login,brand_typo,True,facebook login,True,0.0,0.0
+instagrem,instagram,brand_typo,True,instagram,True,0.0,0.0
+linkdin profile,linkedin profile,brand_typo,True,linking profile,False,0.1875,0.5
+gitub repo,github repo,brand_typo,True,tub repo,False,0.2727272727272727,0.5
+163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
+100 AA,AA100,flight_order,True,AA100,True,0.0,0.0
+417 BA,BA417,flight_order,True,BA417,True,0.0,0.0
+SQ 163,SQ163,flight_order,True,SQ163,True,0.0,0.0
+AA 100,AA100,flight_order,True,AA100,True,0.0,0.0
+815 DL,DL815,flight_order,True,DL815,True,0.0,0.0
+200 UA,UA200,flight_order,True,UA200,True,0.0,0.0
+flight 163 SQ,SQ163,flight_order,True,SQ163,True,0.0,0.0
+AA flight 100,AA100,flight_order,True,AA flight 100,False,0.6153846153846154,3.0
+15 iphone,iphone 15,product_order,True,iphone 15,True,0.0,0.0
+pro 14 macbook,macbook pro 14,product_order,True,macbook pro 14,True,0.0,0.0
+s24 samsung,samsung s24,product_order,True,samsung s24,True,0.0,0.0
+ultra s23 samsung,samsung s23 ultra,product_order,True,samsung ultra s23,False,0.47058823529411764,0.6666666666666666
+air 13 macbook,macbook air 13,product_order,True,macbook air 13,True,0.0,0.0
+pro ipad 12,ipad pro 12,product_order,True,ipad pro 12,True,0.0,0.0
+max pro 15 iphone,iphone 15 pro max,product_order,True,iphone max pro 15,False,0.35294117647058826,0.5
+pixel 8 google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
+tab s9 samsung,samsung tab s9,product_order,True,samsung tab s9,True,0.0,0.0
+AAPL stock,AAPL,stock_canon,True,AAPL,True,0.0,0.0
+stock TSLA,TSLA,stock_canon,True,TSLA,True,0.0,0.0
+apple aapl,AAPL,stock_canon,True,AAPL,True,0.0,0.0
+tesla stock price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
+MSFT share price,MSFT,stock_canon,True,MSFT,True,0.0,0.0
+google stock GOOGL,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
+amazon AMZN stock,AMZN,stock_canon,True,AMZN,True,0.0,0.0
+meta stock FB,META,stock_canon,True,META,True,0.0,0.0
+nvda share price,NVDA,stock_canon,True,NVDA,True,0.0,0.0
+iphone15,iphone 15,spacing,True,iphone 15,True,0.0,0.0
+macbookpro,macbook pro,spacing,True,macbook pro,True,0.0,0.0
+nearme,near me,spacing,True,near me,True,0.0,0.0
+bestbuy,best buy,spacing,True,best buy,True,0.0,0.0
+newyork,new york,spacing,True,new york,True,0.0,0.0
+unitedstates,united states,spacing,True,united states,True,0.0,0.0
+wifi password,wifi password,spacing,False,wifi password,True,0.0,0.0
+hotdog,hotdog,spacing,False,hotdog,True,0.0,0.0
+appl,appl,no_change,False,appl,True,0.0,0.0
+rust,rust,no_change,False,rust,True,0.0,0.0
+delta,delta,no_change,False,delta,True,0.0,0.0
+apple,apple,no_change,False,apple,True,0.0,0.0
+python,python,no_change,False,python,True,0.0,0.0
+java,java,no_change,False,java,True,0.0,0.0
+echo,echo,no_change,False,echo,True,0.0,0.0
+spring,spring,no_change,False,spring,True,0.0,0.0
+cloud,cloud,no_change,False,cloud,True,0.0,0.0
+mercury,mercury,no_change,False,mercury,True,0.0,0.0
+npm,npm,no_change,False,npm,True,0.0,0.0
+gcc,gcc,no_change,False,gcc,True,0.0,0.0
+css,css,no_change,False,css,True,0.0,0.0
+go,go,no_change,False,go,True,0.0,0.0
+swift,swift,no_change,False,swift,True,0.0,0.0
+waether forecast tomorrow,weather forecast tomorrow,single_typo,True,whether forecast tomorrow,False,0.08,0.3333333333333333
+best pizzeria near me,best pizzeria near me,single_typo,False,best pizzeria near me,True,0.0,0.0
+how to cook pasta,how to cook pasta,single_typo,False,how to cook pasta,True,0.0,0.0
+gas stations nearby,gas stations nearby,single_typo,False,gas stations nearby,True,0.0,0.0
+resturant reservations online,restaurant reservations online,single_typo,True,restaurant reservations online,True,0.0,0.0
+puplic libraries near me,public libraries near me,single_typo,True,public libraries near me,True,0.0,0.0
+best plumber in my area,best plumber in my area,single_typo,False,best plumber in my area,True,0.0,0.0
+forcast weekend weather,forecast weekend weather,single_typo,True,forecast weekend weather,True,0.0,0.0
+how to pronounce worcester,how to pronounce worcester,single_typo,False,how to pronounce worcester,True,0.0,0.0
+recipie for chocolate cake,recipe for chocolate cake,single_typo,True,recipe for chocolate cake,True,0.0,0.0
+hardware store locator,hardware store locator,single_typo,False,hardware store locator,True,0.0,0.0
+trafic conditions now,traffic conditions now,single_typo,True,traffic conditions now,True,0.0,0.0
+vacuum cleaner reviews,vacuum cleaner reviews,single_typo,False,vacuum cleaner reviews,True,0.0,0.0
+how to spell occassion,how to spell occasion,single_typo,True,how to spell occasion,True,0.0,0.0
+dentist appointements available,dentist appointments available,single_typo,True,dentist appointments available,True,0.0,0.0
+nearest pharmacy open now,nearest pharmacy open now,single_typo,False,nearest pharmacy open now,True,0.0,0.0
+beginner gardening tips,beginner gardening tips,single_typo,False,beginner gardening tips,True,0.0,0.0
+calorie counter app,calorie counter app,single_typo,False,calorie counter app,True,0.0,0.0
+what is the defintion of serendipity,what is the definition of serendipity,single_typo,True,what is the definition of serendipity,True,0.0,0.0
+best electrician in town,best electrician in town,single_typo,False,best electrician in town,True,0.0,0.0
+humidty levels today,humidity levels today,single_typo,True,humidity levels today,True,0.0,0.0
+directions to airport,directions to airport,single_typo,False,directions to airport,True,0.0,0.0
+how to spel definitely,how to spell definitely,single_typo,True,how to spel definitely,False,0.043478260869565216,0.25
+grocery stores near me,grocery stores near me,single_typo,False,grocery stores near me,True,0.0,0.0
+buisness hours for target,business hours for target,single_typo,True,business hours for target,True,0.0,0.0
+weather in chicago tommorow,weather in chicago tomorrow,single_typo,True,weather in chicago tomorrow,True,0.0,0.0
+how to make omlette,how to make omelette,single_typo,True,how to make omelette,True,0.0,0.0
+atm machine locations,atm machine locations,single_typo,False,atm machine locations,True,0.0,0.0
+barber shop avaialble,barber shop available,single_typo,True,barber shop available,True,0.0,0.0
+best restauarant in boston,best restaurant in boston,single_typo,True,best restaurant in boston,True,0.0,0.0
+how to cook brocoli,how to cook broccoli,single_typo,True,how to cook broccoli,True,0.0,0.0
+swimming pools near me,swimming pools near me,single_typo,False,swimming pools near me,True,0.0,0.0
+seperate the documents,separate the documents,single_typo,True,separate the documents,True,0.0,0.0
+temperture in fahrenheit,temperature in fahrenheit,single_typo,True,temperature in fahrenheit,True,0.0,0.0
+parking garage nearby,parking garage nearby,single_typo,False,parking garage nearby,True,0.0,0.0
+how to make lasagna recepie,how to make lasagna recipe,single_typo,True,how to make lasagna receive,False,0.07407407407407407,0.2
+veterinary clinic hours,veterinary clinic hours,single_typo,False,veterinary clinic hours,True,0.0,0.0
+what does recieve mean,what does receive mean,single_typo,True,what does receive mean,True,0.0,0.0
+yoga classes availible now,yoga classes available now,single_typo,True,yoga classes available now,True,0.0,0.0
+begininng spanish lessons,beginning spanish lessons,single_typo,True,beginning spanish lessons,True,0.0,0.0
+plumber near me emergancy,plumber near me emergency,multi_typo,True,plumber near me emergency,True,0.0,0.0
+dentist appoitment availble,dentist appointment available,multi_typo,True,dentist appointment available,True,0.0,0.0
+electritian repaire servises,electrician repair services,multi_typo,True,electrician repair services,True,0.0,0.0
+weather forcast this weakend,weather forecast this weekend,multi_typo,True,weather forecast this weekend,True,0.0,0.0
+neerest gass station,nearest gas station,multi_typo,True,nearest gass station,False,0.05,0.3333333333333333
+hotel reservation cheep rates,hotel reservation cheap rates,multi_typo,True,hotel reservation cheep rates,False,0.034482758620689655,0.25
+autombile mechanic lokation,automobile mechanic location,multi_typo,True,automobile mechanic location,True,0.0,0.0
+humidty forcast tomorow,humidity forecast tomorrow,multi_typo,True,humidity forecast tomorrow,True,0.0,0.0
+locksmith emergancy servise,locksmith emergency service,multi_typo,True,locksmith emergency service,True,0.0,0.0
+flight ticket prices comparision,flight ticket prices comparison,multi_typo,True,flight ticket prices comparison,True,0.0,0.0
+restauant reservaton opem,restaurant reservation open,multi_typo,True,restaurant reservation opem,False,0.037037037037037035,0.3333333333333333
+carpentor contrator estimat,carpenter contractor estimate,multi_typo,True,carpenter contractor estimate,True,0.0,0.0
+tempreture alert wheather,temperature alert weather,multi_typo,True,temperature alert whether,False,0.08,0.3333333333333333
+tourist atraction guidebook,tourist attraction guidebook,multi_typo,True,tourist attraction guidebook,True,0.0,0.0
+laundry servise neerby,laundry service nearby,multi_typo,True,laundry service nearby,True,0.0,0.0
+vehcile registation renewel,vehicle registration renewal,multi_typo,True,vehicle registration renewed,False,0.07142857142857142,0.3333333333333333
+snowstrom warning forcast,snowstorm warning forecast,multi_typo,True,snowstorm warning forecast,True,0.0,0.0
+hostotel accomodation deals,hostel accommodation deals,multi_typo,True,hostel accommodation deals,True,0.0,0.0
+haircut appoitment schedul,haircut appointment schedule,multi_typo,True,haircut appointment schedule,True,0.0,0.0
+sunrise time locaton,sunrise time location,multi_typo,True,sunrise time location,True,0.0,0.0
+moving compeny quoate,moving company quote,multi_typo,True,moving company quote,True,0.0,0.0
+road conidtion trafic update,road condition traffic update,multi_typo,True,road condition traffic update,True,0.0,0.0
+veterinarian emergancy clinc,veterinarian emergency clinic,multi_typo,True,veterinarian emergency clinic,True,0.0,0.0
+vacation packge discunt availble,vacation package discount available,multi_typo,True,vacation package discount available,True,0.0,0.0
+pest contral servise lokation,pest control service location,multi_typo,True,pest control service location,True,0.0,0.0
+pollin forcast alergy,pollen forecast allergy,multi_typo,True,pollen forecast allergy,True,0.0,0.0
+airbnb accomodaton recomendation,airbnb accommodation recommendation,multi_typo,True,airing accommodation recommendation,False,0.05714285714285714,0.3333333333333333
+window cleening compny rates,window cleaning company rates,multi_typo,True,window cleaning company rates,True,0.0,0.0
+wind gust wheather alert,wind gust weather alert,multi_typo,True,wind gust whether alert,False,0.08695652173913043,0.25
+rentral car comparision price,rental car comparison price,multi_typo,True,central car comparison price,False,0.07142857142857142,0.25
+goggle.com,google.com,brand_typo,True,goggle com,False,0.2,2.0
+amazn.com,amazon.com,brand_typo,True,amazon com,False,0.1,2.0
+spotfiy music,spotify music,brand_typo,True,spotty music,False,0.15384615384615385,0.5
+instgram app,instagram app,brand_typo,True,ingram app,False,0.23076923076923078,0.5
+gitub profile,github profile,brand_typo,True,tub profile,False,0.21428571428571427,0.5
+redditt.com,reddit.com,brand_typo,True,reddish com,False,0.2727272727272727,2.0
+twiter feed,twitter feed,brand_typo,True,twitter feed,True,0.0,0.0
+linkdin jobs,linkedin jobs,brand_typo,True,linking jobs,False,0.23076923076923078,0.5
+microsodt office,microsoft office,brand_typo,True,microsoft office,True,0.0,0.0
+adoobe creative,adobe creative,brand_typo,True,adobe creative,True,0.0,0.0
+dropbx files,dropbox files,brand_typo,True,drop files,False,0.23076923076923078,0.5
+zom meeting,zoom meeting,brand_typo,True,zom meeting,False,0.08333333333333333,0.5
+slck workspace,slack workspace,brand_typo,True,slck workspace,False,0.06666666666666667,0.5
+paypa checkout,paypal checkout,brand_typo,True,papa checkout,False,0.13333333333333333,0.5
+ebya auction,ebay auction,brand_typo,True,ebya auction,False,0.16666666666666666,0.5
+wallmart groceries,walmart groceries,brand_typo,True,walmart groceries,True,0.0,0.0
+targat deals,target deals,brand_typo,True,target deals,True,0.0,0.0
+nytimez news,nytimes news,brand_typo,True,anytime news,False,0.16666666666666666,0.5
+bbc.co.uk,bbc.com,brand_typo,True,bic co us,False,0.5555555555555556,3.0
+cnn breaking,cnn breaking news,brand_typo,True,cnn breaking,False,0.29411764705882354,0.3333333333333333
+youtub video,youtube video,brand_typo,True,youtube video,True,0.0,0.0
+netflic series,netflix series,brand_typo,True,netflix series,True,0.0,0.0
+googl drive,google drive,brand_typo,True,GOOGL,False,0.5833333333333334,1.0
+amzon shopping,amazon shopping,brand_typo,True,amazon shopping,True,0.0,0.0
+spotiffy playlist,spotify playlist,brand_typo,True,spiffy playlets,False,0.375,1.0
+facebk messenger,facebook messenger,brand_typo,True,face messenger,False,0.2222222222222222,0.5
+insta stories,instagram stories,brand_typo,True,instar stories,False,0.17647058823529413,0.5
+gihub code,github code,brand_typo,True,hub code,False,0.2727272727272727,0.5
+reddot forum,reddit forum,brand_typo,True,redo forum,False,0.25,0.5
+BA 287,BA287,flight_order,True,BA287,True,0.0,0.0
+502 DL,DL502,flight_order,True,DL502,True,0.0,0.0
+flight UA 441,UA441,flight_order,True,UA441,True,0.0,0.0
+lh 156,LH156,flight_order,True,LH156,True,0.0,0.0
+273 AF,AF273,flight_order,True,AF273,True,0.0,0.0
+EK  89,EK89,flight_order,True,EK89,True,0.0,0.0
+621qr,QR621,flight_order,True,QR621,True,0.0,0.0
+CX 884,CX884,flight_order,True,CX884,True,0.0,0.0
+345 vs,VS345,flight_order,True,VS345,True,0.0,0.0
+KL 714,KL714,flight_order,True,KL714,True,0.0,0.0
+193ib,IB193,flight_order,True,IB193,True,0.0,0.0
+TK 427,TK427,flight_order,True,TK427,True,0.0,0.0
+flight 556 AA,AA556,flight_order,True,AA556,True,0.0,0.0
+738 ba,BA738,flight_order,True,BA738,True,0.0,0.0
+DL 212,DL212,flight_order,True,DL212,True,0.0,0.0
+84ua,UA84,flight_order,True,UA84,True,0.0,0.0
+AF  609,AF609,flight_order,True,AF609,True,0.0,0.0
+445 ek,EK445,flight_order,True,EK445,True,0.0,0.0
+sq 267,SQ267,flight_order,True,SQ267,True,0.0,0.0
+572 CX,CX572,flight_order,True,CX572,True,0.0,0.0
+flight vs314,VS314,flight_order,True,flight vs314,False,0.5833333333333334,1.0
+981 KL,KL981,flight_order,True,KL981,True,0.0,0.0
+IB   456,IB456,flight_order,True,IB456,True,0.0,0.0
+tk 103,TK103,flight_order,True,TK103,True,0.0,0.0
+890 SQ,SQ890,flight_order,True,SQ890,True,0.0,0.0
+13 air macbook,macbook air 13,product_order,True,macbook 13 air,False,0.42857142857142855,0.6666666666666666
+pro iphone 15,iphone 15 pro,product_order,True,iphone pro 15,False,0.46153846153846156,0.6666666666666666
+8 pixel google,google pixel 8,product_order,True,pixel 8 google,False,0.8571428571428571,0.6666666666666666
+ultra 24 s23 samsung,samsung s23 ultra,product_order,True,samsung ultra 24 s23,False,0.55,1.0
+fold 5 samsung galaxy,samsung galaxy fold 5,product_order,True,samsung fold 5 galaxy,False,0.47619047619047616,0.5
+max 14 pro iphone,iphone 14 pro max,product_order,True,iphone max 14 pro,False,0.47058823529411764,0.5
+16 macbook pro,macbook pro 16,product_order,True,macbook 16 pro,False,0.42857142857142855,0.6666666666666666
+pixel pro 7 google,google pixel 7 pro,product_order,True,pixel pro 7 google,False,0.7777777777777778,0.75
+tab s9 samsung galaxy,samsung galaxy tab s9,product_order,True,samsung tab s9 galaxy,False,0.47619047619047616,0.5
+12 mini iphone,iphone 12 mini,product_order,True,iphone 12 mini,True,0.0,0.0
+z fold 4 samsung,samsung galaxy z fold 4,product_order,True,samsung z fold 4,False,0.30434782608695654,0.2
+watch series 9 apple,apple watch series 9,product_order,True,watch series 9 apple,False,0.6,0.5
+xl pixel 8 google,google pixel 8 xl,product_order,True,pixel xl 8 google,False,0.7647058823529411,0.75
+s24 ultra samsung,samsung s24 ultra,product_order,True,samsung s24 ultra,True,0.0,0.0
+15 macbook air,macbook air 15,product_order,True,macbook 15 air,False,0.42857142857142855,0.6666666666666666
+iphone pro 13,iphone 13 pro,product_order,True,iphone pro 13,False,0.46153846153846156,0.6666666666666666
+flip 5 z samsung,samsung galaxy z flip 5,product_order,True,samsung flip 5 z,False,0.4782608695652174,0.6
+7 series watch apple,apple watch series 7,product_order,True,7 series watch apple,False,0.7,1.0
+a15 oneplus,oneplus a15,product_order,True,a15 onerous,False,0.9090909090909091,1.0
+pad air 11 ipad,ipad air 11,product_order,True,ipad pad air 11,False,0.26666666666666666,0.3333333333333333
+ultra 15 iphone pro,iphone 15 pro max,product_order,True,iphone ultra 15 pro,False,0.5263157894736842,0.5
+note 24 galaxy samsung,samsung galaxy note 24,product_order,True,samsung note 24 galaxy,False,0.6363636363636364,0.5
+11 pro max iphone,iphone 11 pro max,product_order,True,iphone 11 pro max,True,0.0,0.0
+studio display apple,apple studio display,product_order,True,studio display apple,False,0.6,0.6666666666666666
+x1 carbon lenovo thinkpad,lenovo thinkpad x1 carbon,product_order,True,x1 carbon lenore thinkpad,False,0.88,1.0
+AAPL stock price,AAPL,stock_canon,True,AAPL,True,0.0,0.0
+tesla share price,TSLA,stock_canon,True,TSLA,True,0.0,0.0
+MSFT earnings,MSFT,stock_canon,True,MSFT,True,0.0,0.0
+google GOOGL stock,GOOGL,stock_canon,True,GOOGL,True,0.0,0.0
+AMZN share,AMZN,stock_canon,True,AMZN,True,0.0,0.0
+amazon price AMZN,AMZN,stock_canon,True,AMZN,True,0.0,0.0
+META stock price,META,stock_canon,True,META,True,0.0,0.0
+nvidia NVDA,NVDA,stock_canon,True,NVDA,True,0.0,0.0
+NFLX share price,NFLX,stock_canon,True,NFLX,True,0.0,0.0
+netflix stock NFLX,NFLX,stock_canon,True,NFLX,True,0.0,0.0
+PYPL price,PYPL,stock_canon,True,PYPL,True,0.0,0.0
+paypal PYPL stock,PYPL,stock_canon,True,PYPL,True,0.0,0.0
+SNAP stock,SNAP,stock_canon,True,SNAP,True,0.0,0.0
+snapchat SNAP,SNAP,stock_canon,True,SNAP,True,0.0,0.0
+AMD share price,AMD,stock_canon,True,AMD,True,0.0,0.0
+amd processor stock,AMD,stock_canon,True,AMD,True,0.0,0.0
+INTC earnings,INTC,stock_canon,True,INTC,True,0.0,0.0
+intel INTC stock,INTC,stock_canon,True,INTC,True,0.0,0.0
+QCOM price,QCOM,stock_canon,True,QCOM price,False,0.6,1.0
+qualcomm QCOM,QCOM,stock_canon,True,qualcomm QCOM,False,0.6923076923076923,1.0
+UBER stock price,UBER,stock_canon,True,UBER,True,0.0,0.0
+lyft LYFT share,LYFT,stock_canon,True,LYFT,True,0.0,0.0
+airbnb ABNB stock,ABNB,stock_canon,True,ABNB,True,0.0,0.0
+iphone15pro,iphone 15 pro,spacing,True,iphone pro,False,0.23076923076923078,0.3333333333333333
+samsungz9,samsung z9,spacing,True,samsung,False,0.3,0.5
+ipadair,ipad air,spacing,True,ipad air,True,0.0,0.0
+losangeles,los angeles,spacing,True,los angeles,True,0.0,0.0
+sanfrancisco,san francisco,spacing,True,san francisco,True,0.0,0.0
+nearbyshops,nearby shops,spacing,True,nearby shops,True,0.0,0.0
+dellxps13,dell xps 13,spacing,True,dell see,False,0.45454545454545453,0.6666666666666666
+surfacelaptopp5,surface laptop p5,spacing,True,surface laptop,False,0.17647058823529413,0.3333333333333333
+newyorkpizza,new york pizza,spacing,True,new pizza,False,0.35714285714285715,0.3333333333333333
+holmescompany,holmes company,spacing,True,holmes company,True,0.0,0.0
+findme,find me,spacing,True,find me,True,0.0,0.0
+bostonma,boston ma,spacing,True,boston a,False,0.1111111111111111,0.5
+sanjose,san jose,spacing,True,san jose,True,0.0,0.0
+pixelwatch2,pixel watch 2,spacing,True,pixel watch,False,0.15384615384615385,0.3333333333333333
+openpizza,open pizza,spacing,True,open pizza,True,0.0,0.0
+northcarolina,north carolina,spacing,True,north carolina,True,0.0,0.0
+showevenear,show venues near,spacing,True,showed near,False,0.375,0.6666666666666666
+galax30series,galax 30 series,spacing,True,galaxy series,False,0.2,0.6666666666666666
+laptop,laptop,spacing,False,laptop,True,0.0,0.0
+smartphone,smartphone,spacing,False,smartphone,True,0.0,0.0
+keyboard,keyboard,spacing,False,keyboard,True,0.0,0.0
+monitor,monitor,spacing,False,monitor,True,0.0,0.0
+NYC,NYC,no_change,False,NYC,True,0.0,0.0
+LA,LA,no_change,False,LA,True,0.0,0.0
+UK,UK,no_change,False,UK,True,0.0,0.0
+vue,vue,no_change,False,vue,True,0.0,0.0
+aws,aws,no_change,False,aws,True,0.0,0.0
+sql,sql,no_change,False,sql,True,0.0,0.0
+git,git,no_change,False,git,True,0.0,0.0
+c,c,no_change,False,c,True,0.0,0.0
+x,x,no_change,False,x,True,0.0,0.0
+r,r,no_change,False,r,True,0.0,0.0
+z,z,no_change,False,z,True,0.0,0.0
+kafka,kafka,no_change,False,kafka,True,0.0,0.0
+nginx,nginx,no_change,False,nine,False,0.4,1.0
+vim,vim,no_change,False,vim,True,0.0,0.0
+pdf,pdf,no_change,False,pdf,True,0.0,0.0
+xml,xml,no_change,False,xml,True,0.0,0.0
+svg,svg,no_change,False,svg,True,0.0,0.0
+gcp,gcp,no_change,False,gcp,True,0.0,0.0
+cli,cli,no_change,False,cli,True,0.0,0.0
+api,api,no_change,False,api,True,0.0,0.0
+jwt,jwt,no_change,False,jwt,True,0.0,0.0
+mvp,mvp,no_change,False,mvp,True,0.0,0.0
+gdpr,gdpr,no_change,False,gdpr,True,0.0,0.0
+crm,crm,no_change,False,crm,True,0.0,0.0
+ux,ux,no_change,False,ux,True,0.0,0.0
+pwa,pwa,no_change,False,pwa,True,0.0,0.0
+orm,orm,no_change,False,orm,True,0.0,0.0