vazish committed on
Commit
a9c2bf4
·
unverified ·
1 Parent(s): cce5dde
Files changed (2) hide show
  1. build_vocab.py +80 -0
  2. orcas_vocab.txt +0 -0
build_vocab.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build a SymSpell-compatible frequency dictionary from ORCAS-I queries.
3
+
4
+ Extracts all words from 2M search queries, counts frequencies, and saves a
5
+ vocab file that SymSpell can load alongside its default English dictionary.
6
+ This gives SymSpell domain coverage it lacks — food terms, place names,
7
+ local services, brand names etc. that appear in real search queries.
8
+
9
+ Usage:
10
+ python3 build_vocab.py
11
+ python3 build_vocab.py --input ~/Desktop/intent_classification/ORCAS-I-2M.tsv --min-freq 10
12
+
13
+ Output:
14
+ orcas_vocab.txt (word<TAB>frequency, one per line)
15
+ """
16
+
17
+ import re
18
+ import argparse
19
+ from pathlib import Path
20
+ from collections import Counter
21
+
22
+ import pandas as pd
23
+
24
# Input/output locations: the ORCAS-I TSV is expected under the user's
# Desktop by default, and the vocab file is written next to this script.
DESKTOP = Path.home() / "Desktop"
DEFAULT_IN = DESKTOP / "intent_classification" / "ORCAS-I-2M.tsv"
DEFAULT_OUT = Path(__file__).parent / "orcas_vocab.txt"

# Tokens to skip — numbers, single chars, and tokens that look like
# structured entities (all-caps tickers, flight codes) which we don't
# want SymSpell treating as "correct" spellings for correction targets.
# NOTE(review): build_vocab lowercases every token before matching, so
# the uppercase branches ([A-Z]{2,5}\d+ and \d+[A-Z]{2,3}) can never
# fire, and the \d+ / single-[a-z] branches are already excluded by the
# len >= 2 and isalpha() checks in the counting loop. The pattern is
# effectively dead as written — confirm intent before relying on it.
_SKIP_RE = re.compile(r'^(\d+|[a-z]|[A-Z]{2,5}\d+|\d+[A-Z]{2,3})$')
32
+
33
+
34
def build_vocab(input_path: Path, output_path: Path, min_freq: int) -> None:
    """Count word frequencies in ORCAS-I queries and write a SymSpell vocab.

    Reads the ``query`` column of the TSV at *input_path*, lowercases and
    whitespace-tokenizes each query, strips surrounding punctuation, keeps
    alphabetic tokens of length >= 2 that pass the ``_SKIP_RE`` filter, and
    writes ``word<TAB>frequency`` lines (most frequent first) to
    *output_path*.

    Args:
        input_path: ORCAS-I TSV file; must contain a ``query`` column.
        output_path: Destination vocab file; parent dirs are created.
        min_freq: Words seen fewer than this many times are dropped.
    """
    print(f"Loading {input_path} ...")
    # na_filter=False keeps empty cells as "" instead of NaN, so every value
    # is a str and .lower()/.split() below cannot fail on a float.
    df = pd.read_csv(input_path, sep="\t", usecols=["query"],
                     dtype=str, na_filter=False)
    print(f" {len(df):,} queries loaded")

    print("Counting word frequencies ...")
    counter: Counter = Counter()
    skip = _SKIP_RE.match  # hoist bound-method lookup out of the hot loop
    for query in df["query"]:
        for tok in query.lower().split():
            # Strip leading/trailing punctuation only; inner chars stay.
            tok = tok.strip(".,!?;:\"'()[]{}")
            # Cheap checks first: len/isalpha reject most junk before the
            # regex runs. Reordering is safe — all three tests are pure.
            if len(tok) >= 2 and tok.isalpha() and not skip(tok):
                counter[tok] += 1

    before = len(counter)
    counter = Counter({w: c for w, c in counter.items() if c >= min_freq})
    print(f" {before:,} unique tokens → {len(counter):,} after min_freq={min_freq} filter")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # One batched writelines call instead of one f.write per word.
        f.writelines(f"{word}\t{freq}\n" for word, freq in counter.most_common())

    print(f"Saved to {output_path}")
    print("\nTop 30 terms:")  # was an f-string with no placeholders (F541)
    for word, freq in counter.most_common(30):
        print(f" {word:<20} {freq:>8,}")
62
+
63
+
64
def main():
    """CLI entry point: parse arguments and build the vocab file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=Path, default=str(DEFAULT_IN))
    parser.add_argument("--output", type=Path, default=str(DEFAULT_OUT))
    parser.add_argument("--min-freq", type=int, default=5,
                        help="Minimum word frequency to include (default: 5)")
    args = parser.parse_args()

    input_path: Path = args.input
    if not input_path.exists():
        msg = (f"ORCAS-I file not found: {input_path}\n"
               f"Pass the correct path with --input")
        raise FileNotFoundError(msg)

    build_vocab(input_path, args.output, args.min_freq)


if __name__ == "__main__":
    main()
orcas_vocab.txt ADDED
The diff for this file is too large to render. See raw diff