Spaces:

AgamP
/

llm_recommendation_backend

Sleeping

llm_recommendation_backend / scripts /build_role_vocab.py

github-actions

Sync from GitHub 2025-12-17T12:18:53Z

5a3b322 4 months ago

3.55 kB

	from __future__ import annotations

	import argparse
	import json
	from collections import Counter
	from pathlib import Path

	import pandas as pd

	from retrieval.query_rewriter import tokenize
	from retrieval.query_rewriter import TECH_SKILLS, SOFT_SKILLS, ROLE_HINTS


	CATALOG_STOPWORDS = {
	"test",
	"tests",
	"assessment",
	"assessments",
	"knowledge",
	"skills",
	"minutes",
	"minute",
	"duration",
	"remote",
	"adaptive",
	"professional",
	"individual",
	"contributor",
	"languages",
	"language",
	"job",
	"levels",
	"level",
	"description",
	"sample",
	}


	def build_role_vocab(
	catalog_path: str,
	top_k: int = 100,
	min_len: int = 4,
	min_df_ratio: float = 0.05,
	max_df_ratio: float = 0.4,
	) -> dict:
	df = pd.read_json(catalog_path, lines=True) if catalog_path.endswith(".jsonl") else pd.read_parquet(catalog_path)
	texts = []
	if "doc_text" in df.columns:
	texts = df["doc_text"].astype(str).tolist()
	else:
	# fallback to name + description
	names = df.get("name", "").astype(str)
	desc = df.get("description", "").astype(str)
	texts = (names + " " + desc).tolist()

	doc_freq = Counter()
	N = len(texts)
	for txt in texts:
	seen = set()
	for tok in tokenize(txt):
	if len(tok) >= min_len and tok not in CATALOG_STOPWORDS:
	seen.add(tok)
	for tok in seen:
	doc_freq[tok] += 1

	min_df = max(1, int(N * min_df_ratio))
	max_df = max(1, int(N * max_df_ratio))
	filtered = [(tok, df) for tok, df in doc_freq.items() if min_df <= df <= max_df]
	# Sort by DF descending
	filtered.sort(key=lambda x: x[1], reverse=True)
	tokens = [tok for tok, _ in filtered[:top_k]]

	technical = [t for t in tokens if t in TECH_SKILLS]
	behavioral = [t for t in tokens if t in SOFT_SKILLS]
	roles = [t for t in tokens if t in ROLE_HINTS]
	generic = [t for t in tokens if t not in technical and t not in behavioral and t not in roles]

	return {
	"all": tokens,
	"technical": technical,
	"behavioral": behavioral,
	"roles": roles,
	"generic": generic,
	"meta": {
	"total_docs": N,
	"min_df": min_df,
	"max_df": max_df,
	"top_k": top_k,
	},
	}


	def main():
	parser = argparse.ArgumentParser(description="Build a simple role/keyword vocab from catalog.")
	parser.add_argument("--catalog", required=True, help="Path to catalog JSONL/parquet with doc_text or name/description.")
	parser.add_argument("--out", default="data/catalog_role_vocab.json", help="Output JSON file.")
	parser.add_argument("--top-k", type=int, default=100, help="Top-k tokens to keep.")
	parser.add_argument("--min-len", type=int, default=4, help="Minimum token length.")
	parser.add_argument("--min-df-ratio", type=float, default=0.05, help="Min document frequency ratio to keep a token.")
	parser.add_argument("--max-df-ratio", type=float, default=0.4, help="Max document frequency ratio to keep a token.")
	args = parser.parse_args()

	vocab = build_role_vocab(
	args.catalog,
	top_k=args.top_k,
	min_len=args.min_len,
	min_df_ratio=args.min_df_ratio,
	max_df_ratio=args.max_df_ratio,
	)
	Path(args.out).parent.mkdir(parents=True, exist_ok=True)
	with open(args.out, "w") as f:
	json.dump(vocab, f, indent=2)
	print(f"Saved vocab to {args.out}")


	if __name__ == "__main__":
	main()