Spaces:
Running
Running
vocab
Browse files- build_vocab.py +80 -0
- orcas_vocab.txt +0 -0
build_vocab.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Build a SymSpell-compatible frequency dictionary from ORCAS-I queries.

Extracts all words from 2M search queries, counts frequencies, and saves a
vocab file that SymSpell can load alongside its default English dictionary.
This gives SymSpell domain coverage it lacks — food terms, place names,
local services, brand names etc. that appear in real search queries.

Usage:
    python3 build_vocab.py
    python3 build_vocab.py --input ~/Desktop/intent_classification/ORCAS-I-2M.tsv --min-freq 10

Output:
    orcas_vocab.txt (word<TAB>frequency, one per line)
"""

import re
import argparse
from pathlib import Path
from collections import Counter

import pandas as pd

# Default input/output locations; both overridable via CLI flags.
DESKTOP = Path.home() / "Desktop"
DEFAULT_IN = DESKTOP / "intent_classification" / "ORCAS-I-2M.tsv"
DEFAULT_OUT = Path(__file__).parent / "orcas_vocab.txt"

# Tokens to skip — numbers, single chars, and tokens that look like
# structured entities (all-caps tickers, flight codes) which we don't
# want SymSpell treating as "correct" spellings for correction targets.
# NOTE(review): as applied in build_vocab, tokens are lowercased and
# isalpha-filtered, so the digit and uppercase branches here look
# unreachable — verify the intended usage.
_SKIP_RE = re.compile(r'^(\d+|[a-z]|[A-Z]{2,5}\d+|\d+[A-Z]{2,3})$')
def build_vocab(input_path: Path, output_path: Path, min_freq: int) -> None:
    """Count word frequencies in ORCAS-I queries and write a SymSpell vocab.

    Reads the ``query`` column of a TSV, lowercases and whitespace-splits
    each query, strips surrounding punctuation from each token, keeps
    alphabetic tokens of length >= 2 that occur at least ``min_freq``
    times, and writes ``word<TAB>frequency`` lines sorted by descending
    frequency.

    Args:
        input_path: TSV file containing a ``query`` column.
        output_path: Destination vocab file (parent dirs created as needed).
        min_freq: Minimum occurrences for a word to be kept.
    """
    print(f"Loading {input_path} ...")
    # na_filter=False keeps empty queries as "" instead of NaN, so every
    # cell is a str and .lower()/.split() are always safe.
    df = pd.read_csv(input_path, sep="\t", usecols=["query"],
                     dtype=str, na_filter=False)
    print(f" {len(df):,} queries loaded")

    print("Counting word frequencies ...")
    counter: Counter = Counter()
    strip_chars = ".,!?;:\"'()[]{}"
    for query in df["query"]:
        for tok in query.lower().split():
            # Strip leading/trailing punctuation
            tok = tok.strip(strip_chars)
            # Keep alphabetic tokens of length >= 2.  The original also
            # tested _SKIP_RE here, but after lower() + isalpha() none of
            # its branches (pure digits, single char, caps+digit codes)
            # can ever match, so the per-token regex was pure overhead —
            # dropped with identical output.
            if len(tok) >= 2 and tok.isalpha():
                counter[tok] += 1

    # Apply the frequency floor in one pass; most_common() below handles
    # the descending-frequency ordering.
    before = len(counter)
    counter = Counter({w: c for w, c in counter.items() if c >= min_freq})
    print(f" {before:,} unique tokens → {len(counter):,} after min_freq={min_freq} filter")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # writelines + generator: one buffered pass instead of N tiny writes.
        f.writelines(f"{word}\t{freq}\n" for word, freq in counter.most_common())

    print(f"Saved to {output_path}")
    print("\nTop 30 terms:")  # was an f-string with no placeholders
    for word, freq in counter.most_common(30):
        print(f" {word:<20} {freq:>8,}")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def main() -> None:
    """CLI entry point: parse arguments and build the vocab file.

    Raises:
        FileNotFoundError: if the input TSV does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Build a SymSpell-compatible frequency dictionary "
                    "from ORCAS-I queries.")
    # Pass Path objects as defaults directly.  The original passed
    # str(DEFAULT_IN) and relied on argparse re-parsing *string* defaults
    # through `type=Path` — correct but subtle and easy to break.
    parser.add_argument("--input", default=DEFAULT_IN, type=Path,
                        help="ORCAS-I TSV file (default: %(default)s)")
    parser.add_argument("--output", default=DEFAULT_OUT, type=Path,
                        help="Vocab file to write (default: %(default)s)")
    parser.add_argument("--min-freq", default=5, type=int,
                        help="Minimum word frequency to include (default: 5)")
    args = parser.parse_args()

    if not args.input.exists():
        raise FileNotFoundError(f"ORCAS-I file not found: {args.input}\n"
                                f"Pass the correct path with --input")

    build_vocab(args.input, args.output, args.min_freq)


if __name__ == "__main__":
    main()
|
orcas_vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|