Spaces:
Sleeping
Sleeping
| """ | |
| Build a SymSpell-compatible frequency dictionary from ORCAS-I queries. | |
| Extracts all words from 2M search queries, counts frequencies, and saves a | |
| vocab file that SymSpell can load alongside its default English dictionary. | |
| This gives SymSpell domain coverage it lacks β food terms, place names, | |
| local services, brand names etc. that appear in real search queries. | |
| Usage: | |
| python3 build_vocab.py | |
| python3 build_vocab.py --input ~/Desktop/intent_classification/ORCAS-I-2M.tsv --min-freq 10 | |
| Output: | |
| orcas_vocab.txt (word<TAB>frequency, one per line) | |
| """ | |
import argparse
import re
from collections import Counter
from pathlib import Path

import pandas as pd

DESKTOP = Path.home() / "Desktop"
DEFAULT_IN = DESKTOP / "intent_classification" / "ORCAS-I-2M.tsv"
DEFAULT_OUT = Path(__file__).parent / "orcas_vocab.txt"
# Tokens to skip — numbers, single chars, and tokens that look like
# structured entities (tickers, flight codes) which we don't want
# SymSpell treating as "correct" spellings for correction targets.
# NOTE: tokens are lowercased before matching, so the character classes
# must be lowercase too — the original all-caps classes could never match.
# (The digit-bearing alternatives are currently also subsumed by the
# isalpha() check below; they stay as defense in depth.)
_SKIP_RE = re.compile(r'^(\d+|[a-z]|[a-z]{2,5}\d+|\d+[a-z]{2,3})$')

# Punctuation stripped from token edges before counting.
_EDGE_PUNCT = ".,!?;:\"'()[]{}"


def build_vocab(input_path: Path, output_path: Path, min_freq: int) -> None:
    """Count word frequencies in the ORCAS-I ``query`` column and write a
    SymSpell-style ``word<TAB>frequency`` vocab file, most frequent first.

    Args:
        input_path: TSV file containing a ``query`` column.
        output_path: Destination vocab file; parent dirs created as needed.
        min_freq: Minimum number of occurrences for a word to be kept.
    """
    print(f"Loading {input_path} ...")
    # quoting=3 (csv.QUOTE_NONE): raw query logs may contain stray quote
    # characters that would otherwise corrupt default TSV parsing.
    df = pd.read_csv(input_path, sep="\t", usecols=["query"],
                     dtype=str, na_filter=False, quoting=3)
    print(f" {len(df):,} queries loaded")
    print("Counting word frequencies ...")
    counter: Counter = Counter()
    for query in df["query"]:
        for tok in query.lower().split():
            # Strip leading/trailing punctuation only; inner chars stay.
            tok = tok.strip(_EDGE_PUNCT)
            # Keep alphabetic tokens of 2+ chars that don't look structured.
            if len(tok) >= 2 and tok.isalpha() and not _SKIP_RE.match(tok):
                counter[tok] += 1
    before = len(counter)
    counter = Counter({w: c for w, c in counter.items() if c >= min_freq})
    print(f" {before:,} unique tokens -> {len(counter):,} after min_freq={min_freq} filter")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for word, freq in counter.most_common():
            f.write(f"{word}\t{freq}\n")
    print(f"Saved to {output_path}")
    print("\nTop 30 terms:")
    for word, freq in counter.most_common(30):
        print(f" {word:<20} {freq:>8,}")
def main():
    """Parse command-line options and kick off the vocab build."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--input", type=Path, default=str(DEFAULT_IN))
    arg_parser.add_argument("--output", type=Path, default=str(DEFAULT_OUT))
    arg_parser.add_argument(
        "--min-freq", type=int, default=5,
        help="Minimum word frequency to include (default: 5)")
    opts = arg_parser.parse_args()

    # Fail early with a helpful hint when the ORCAS-I dump is missing.
    if not opts.input.exists():
        raise FileNotFoundError(f"ORCAS-I file not found: {opts.input}\n"
                                f"Pass the correct path with --input")

    build_vocab(opts.input, opts.output, opts.min_freq)


if __name__ == "__main__":
    main()