# query_norm/build_vocab.py
"""
Build a SymSpell-compatible frequency dictionary from ORCAS-I queries.
Extracts all words from 2M search queries, counts frequencies, and saves a
vocab file that SymSpell can load alongside its default English dictionary.
This gives SymSpell domain coverage it lacks — food terms, place names,
local services, brand names etc. that appear in real search queries.
Usage:
python3 build_vocab.py
python3 build_vocab.py --input ~/Desktop/intent_classification/ORCAS-I-2M.tsv --min-freq 10
Output:
orcas_vocab.txt (word<TAB>frequency, one per line)
"""
import re
import argparse
from pathlib import Path
from collections import Counter
import pandas as pd
# Default locations: the input TSV is expected under ~/Desktop/intent_classification;
# the output vocab file is written next to this script.
DESKTOP = Path.home() / "Desktop"
DEFAULT_IN = DESKTOP / "intent_classification" / "ORCAS-I-2M.tsv"
DEFAULT_OUT = Path(__file__).parent / "orcas_vocab.txt"
# Tokens to skip β€” numbers, single chars, and tokens that look like
# structured entities (all-caps tickers, flight codes) which we don't
# want SymSpell treating as "correct" spellings for correction targets.
# NOTE(review): in build_vocab() this regex is applied to LOWERCASED tokens
# that must also pass len>=2 and .isalpha(), so none of these alternatives
# can ever match there — the check is effectively dead. Confirm intent.
_SKIP_RE = re.compile(r'^(\d+|[a-z]|[A-Z]{2,5}\d+|\d+[A-Z]{2,3})$')
def build_vocab(input_path: Path, output_path: Path, min_freq: int) -> None:
    """Build a SymSpell-compatible ``word<TAB>frequency`` vocab file.

    Reads the ``query`` column of an ORCAS-I TSV, lowercases and tokenizes
    every query, counts word frequencies, and writes the surviving words
    (frequency >= *min_freq*) to *output_path*, most frequent first.

    Args:
        input_path: TSV file containing a ``query`` column.
        output_path: Destination vocab file; parent dirs are created.
        min_freq: Minimum occurrence count for a word to be kept.
    """
    print(f"Loading {input_path} ...")
    # na_filter=False keeps empty queries as "" instead of NaN floats.
    df = pd.read_csv(input_path, sep="\t", usecols=["query"],
                     dtype=str, na_filter=False)
    print(f" {len(df):,} queries loaded")
    print("Counting word frequencies ...")
    counter: Counter = Counter()
    for query in df["query"]:
        for tok in query.lower().split():
            # Strip leading/trailing punctuation
            tok = tok.strip(".,!?;:\"'()[]{}")
            # Keep purely alphabetic tokens of length >= 2.  (The previous
            # _SKIP_RE check was dead code: everything it targets — digits,
            # single chars, ticker/flight codes — is already rejected by the
            # isalpha()/length tests on a lowercased token.)
            if len(tok) >= 2 and tok.isalpha():
                counter[tok] += 1
    before = len(counter)
    # Drop rare tokens — most are typos/noise SymSpell should not learn.
    counter = Counter({w: c for w, c in counter.items() if c >= min_freq})
    print(f" {before:,} unique tokens → {len(counter):,} after min_freq={min_freq} filter")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for word, freq in counter.most_common():
            f.write(f"{word}\t{freq}\n")
    print(f"Saved to {output_path}")
    print("\nTop 30 terms:")
    for word, freq in counter.most_common(30):
        print(f" {word:<20} {freq:>8,}")
def main():
    """Parse CLI arguments, validate the input path, and run the build."""
    ap = argparse.ArgumentParser()
    # argparse applies type=Path to string defaults, so these become Paths.
    ap.add_argument("--input", type=Path, default=str(DEFAULT_IN))
    ap.add_argument("--output", type=Path, default=str(DEFAULT_OUT))
    ap.add_argument(
        "--min-freq", type=int, default=5,
        help="Minimum word frequency to include (default: 5)",
    )
    args = ap.parse_args()
    if not args.input.exists():
        raise FileNotFoundError(f"ORCAS-I file not found: {args.input}\n"
                                f"Pass the correct path with --input")
    build_vocab(args.input, args.output, args.min_freq)
# Allow importing this module (e.g. from tests) without triggering the build.
if __name__ == "__main__":
    main()