# llm_recommendation_backend/scripts/build_role_vocab.py
# Synced from GitHub 2025-12-17T12:18:53Z by github-actions (commit 5a3b322)
from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
import pandas as pd
from retrieval.query_rewriter import tokenize
from retrieval.query_rewriter import TECH_SKILLS, SOFT_SKILLS, ROLE_HINTS
# Catalog-specific boilerplate tokens excluded from the vocabulary: these words
# appear in assessment-catalog copy (durations, delivery mode, job-level labels)
# and carry no role/skill signal, so build_role_vocab() drops them up front.
CATALOG_STOPWORDS = {
    "test",
    "tests",
    "assessment",
    "assessments",
    "knowledge",
    "skills",
    "minutes",
    "minute",
    "duration",
    "remote",
    "adaptive",
    "professional",
    "individual",
    "contributor",
    "languages",
    "language",
    "job",
    "levels",
    "level",
    "description",
    "sample",
}
def build_role_vocab(
    catalog_path: str,
    top_k: int = 100,
    min_len: int = 4,
    min_df_ratio: float = 0.05,
    max_df_ratio: float = 0.4,
) -> dict:
    """Build a keyword vocabulary from catalog documents by document frequency.

    Tokens shorter than ``min_len`` or in ``CATALOG_STOPWORDS`` are dropped;
    the survivors are kept when their document frequency lies in
    ``[min_df_ratio * N, max_df_ratio * N]`` (too-rare and too-common tokens
    are both filtered out), and the ``top_k`` most frequent are bucketed
    against the TECH_SKILLS / SOFT_SKILLS / ROLE_HINTS lexicons.

    Args:
        catalog_path: Path to a ``.jsonl`` (read line-delimited) or parquet
            file with a ``doc_text`` column, or ``name``/``description``
            columns as a fallback.
        top_k: Maximum number of tokens to keep.
        min_len: Minimum token length to consider.
        min_df_ratio: Minimum document-frequency ratio to keep a token.
        max_df_ratio: Maximum document-frequency ratio to keep a token.

    Returns:
        Dict with keys ``all``, ``technical``, ``behavioral``, ``roles``,
        ``generic`` (token lists) and ``meta`` (build parameters).
    """
    if catalog_path.endswith(".jsonl"):
        frame = pd.read_json(catalog_path, lines=True)
    else:
        frame = pd.read_parquet(catalog_path)

    if "doc_text" in frame.columns:
        texts = frame["doc_text"].astype(str).tolist()
    else:
        # Fallback to name + description. Note: ``frame.get("name", "")``
        # would return the plain str "" when the column is missing, and
        # ``"".astype(str)`` raises AttributeError — so default to an
        # empty string Series instead.
        empty = pd.Series([""] * len(frame), index=frame.index)
        names = (frame["name"] if "name" in frame.columns else empty).astype(str)
        desc = (frame["description"] if "description" in frame.columns else empty).astype(str)
        texts = (names + " " + desc).tolist()

    doc_freq: Counter = Counter()
    N = len(texts)
    for txt in texts:
        # Deduplicate per document: we count document frequency, not term frequency.
        seen = {
            tok
            for tok in tokenize(txt)
            if len(tok) >= min_len and tok not in CATALOG_STOPWORDS
        }
        doc_freq.update(seen)

    min_df = max(1, int(N * min_df_ratio))
    max_df = max(1, int(N * max_df_ratio))
    # Loop variable renamed from ``df`` — the original shadowed the DataFrame.
    filtered = [(tok, freq) for tok, freq in doc_freq.items() if min_df <= freq <= max_df]
    # Sort by DF descending (stable, so ties keep first-seen order).
    filtered.sort(key=lambda item: item[1], reverse=True)
    tokens = [tok for tok, _ in filtered[:top_k]]

    technical = [t for t in tokens if t in TECH_SKILLS]
    behavioral = [t for t in tokens if t in SOFT_SKILLS]
    roles = [t for t in tokens if t in ROLE_HINTS]
    categorized = set(technical) | set(behavioral) | set(roles)
    generic = [t for t in tokens if t not in categorized]

    return {
        "all": tokens,
        "technical": technical,
        "behavioral": behavioral,
        "roles": roles,
        "generic": generic,
        "meta": {
            "total_docs": N,
            "min_df": min_df,
            "max_df": max_df,
            "top_k": top_k,
        },
    }
def main():
    """CLI entry point: parse arguments, build the vocab, write it to JSON."""
    cli = argparse.ArgumentParser(description="Build a simple role/keyword vocab from catalog.")
    cli.add_argument("--catalog", required=True, help="Path to catalog JSONL/parquet with doc_text or name/description.")
    cli.add_argument("--out", default="data/catalog_role_vocab.json", help="Output JSON file.")
    cli.add_argument("--top-k", type=int, default=100, help="Top-k tokens to keep.")
    cli.add_argument("--min-len", type=int, default=4, help="Minimum token length.")
    cli.add_argument("--min-df-ratio", type=float, default=0.05, help="Min document frequency ratio to keep a token.")
    cli.add_argument("--max-df-ratio", type=float, default=0.4, help="Max document frequency ratio to keep a token.")
    opts = cli.parse_args()

    vocab = build_role_vocab(
        opts.catalog,
        top_k=opts.top_k,
        min_len=opts.min_len,
        min_df_ratio=opts.min_df_ratio,
        max_df_ratio=opts.max_df_ratio,
    )

    # Ensure the destination directory exists before writing.
    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(opts.out, "w") as handle:
        json.dump(vocab, handle, indent=2)
    print(f"Saved vocab to {opts.out}")


if __name__ == "__main__":
    main()