Spaces:
Running
Running
vocab
Browse files- build_vocab.py +80 -0
- orcas_vocab.txt +0 -0
build_vocab.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Build a SymSpell-compatible frequency dictionary from ORCAS-I queries.

Extracts all words from 2M search queries, counts frequencies, and saves a
vocab file that SymSpell can load alongside its default English dictionary.
This gives SymSpell domain coverage it lacks — food terms, place names,
local services, brand names etc. that appear in real search queries.

Usage:
    python3 build_vocab.py
    python3 build_vocab.py --input ~/Desktop/intent_classification/ORCAS-I-2M.tsv --min-freq 10

Output:
    orcas_vocab.txt (word<TAB>frequency, one per line)
"""

import re
import argparse
from pathlib import Path
from collections import Counter

import pandas as pd

# Default input/output locations; both overridable via CLI flags.
DESKTOP = Path.home() / "Desktop"
DEFAULT_IN = DESKTOP / "intent_classification" / "ORCAS-I-2M.tsv"
DEFAULT_OUT = Path(__file__).parent / "orcas_vocab.txt"

# Tokens to skip — numbers, single chars, and tokens that look like
# structured entities (all-caps tickers, flight codes) which we don't
# want SymSpell treating as "correct" spellings for correction targets.
# NOTE(review): as applied in build_vocab, tokens are lowercased and
# isalpha-filtered, so the digit and uppercase branches here look
# unreachable — verify the intended usage.
_SKIP_RE = re.compile(r'^(\d+|[a-z]|[A-Z]{2,5}\d+|\d+[A-Z]{2,3})$')
def build_vocab(input_path: Path, output_path: Path, min_freq: int) -> None:
    """Count word frequencies in ORCAS-I queries and write a SymSpell vocab.

    Reads the ``query`` column of a TSV, lowercases and whitespace-splits
    each query, strips surrounding punctuation from each token, keeps
    alphabetic tokens of length >= 2 that occur at least ``min_freq``
    times, and writes ``word<TAB>frequency`` lines sorted by descending
    frequency.

    Args:
        input_path: TSV file containing a ``query`` column.
        output_path: Destination vocab file (parent dirs created as needed).
        min_freq: Minimum occurrences for a word to be kept.
    """
    print(f"Loading {input_path} ...")
    # na_filter=False keeps empty queries as "" instead of NaN, so every
    # cell is a str and .lower()/.split() are always safe.
    df = pd.read_csv(input_path, sep="\t", usecols=["query"],
                     dtype=str, na_filter=False)
    print(f" {len(df):,} queries loaded")

    print("Counting word frequencies ...")
    counter: Counter = Counter()
    strip_chars = ".,!?;:\"'()[]{}"
    for query in df["query"]:
        for tok in query.lower().split():
            # Strip leading/trailing punctuation
            tok = tok.strip(strip_chars)
            # Keep alphabetic tokens of length >= 2.  The original also
            # tested _SKIP_RE here, but after lower() + isalpha() none of
            # its branches (pure digits, single char, caps+digit codes)
            # can ever match, so the per-token regex was pure overhead —
            # dropped with identical output.
            if len(tok) >= 2 and tok.isalpha():
                counter[tok] += 1

    # Apply the frequency floor in one pass; most_common() below handles
    # the descending-frequency ordering.
    before = len(counter)
    counter = Counter({w: c for w, c in counter.items() if c >= min_freq})
    print(f" {before:,} unique tokens → {len(counter):,} after min_freq={min_freq} filter")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # writelines + generator: one buffered pass instead of N tiny writes.
        f.writelines(f"{word}\t{freq}\n" for word, freq in counter.most_common())

    print(f"Saved to {output_path}")
    print("\nTop 30 terms:")  # was an f-string with no placeholders
    for word, freq in counter.most_common(30):
        print(f" {word:<20} {freq:>8,}")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def main() -> None:
    """CLI entry point: parse arguments and build the vocab file.

    Raises:
        FileNotFoundError: if the input TSV does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Build a SymSpell-compatible frequency dictionary "
                    "from ORCAS-I queries.")
    # Pass Path objects as defaults directly.  The original passed
    # str(DEFAULT_IN) and relied on argparse re-parsing *string* defaults
    # through `type=Path` — correct but subtle and easy to break.
    parser.add_argument("--input", default=DEFAULT_IN, type=Path,
                        help="ORCAS-I TSV file (default: %(default)s)")
    parser.add_argument("--output", default=DEFAULT_OUT, type=Path,
                        help="Vocab file to write (default: %(default)s)")
    parser.add_argument("--min-freq", default=5, type=int,
                        help="Minimum word frequency to include (default: 5)")
    args = parser.parse_args()

    if not args.input.exists():
        raise FileNotFoundError(f"ORCAS-I file not found: {args.input}\n"
                                f"Pass the correct path with --input")

    build_vocab(args.input, args.output, args.min_freq)


if __name__ == "__main__":
    main()
|
orcas_vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|