import nltk

# NLTK >= 3.9 looks up the new-style data packages (`punkt_tab`,
# `averaged_perceptron_tagger_eng`); older releases look up the legacy ones
# (`punkt`, `averaged_perceptron_tagger`). Fetch both generations of both
# resources so whichever NLTK is installed finds its data. `quiet=True` keeps
# the startup log clean; a failed download is not treated as fatal here.
for _nltk_resource in (
    "punkt_tab",
    "punkt",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_nltk_resource, quiet=True)

# ── Compatibility shim for JAAT 0.7.x + transformers ≥ 4.45 ─────────────────
# JAAT's WageExtract.__init__ calls
#   AutoModelForTokenClassification.from_pretrained(..., max_length=128, ...)
# which newer transformers rejects ("unexpected keyword argument 'max_length'")
# because the kwarg is routed into the model subclass __init__. `max_length`
# is a tokenizer-side option and safe to drop for the model load.
from transformers import AutoModelForTokenClassification as _AMTC

_orig_amtc_fp = _AMTC.from_pretrained


def _patched_amtc_fp(*args, **kwargs):
    """Forward to the original `from_pretrained`, minus any `max_length` kwarg."""
    kwargs.pop("max_length", None)
    return _orig_amtc_fp(*args, **kwargs)


_AMTC.from_pretrained = _patched_amtc_fp

# ── TitleMatch compat: DebertaV2Tokenizer.batch_encode_plus ─────────────────
# JAAT.TitleMatch.get_title calls feature_tokenizer.batch_encode_plus(...).
# Newer transformers versions raise "DebertaV2Tokenizer has no attribute
# batch_encode_plus" for the slow DebertaV2 tokenizer. `__call__` accepts the
# same kwargs and returns the same BatchEncoding, so alias it.
try:
    from transformers.models.deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer

    # Only add the alias when the installed transformers lacks the method, so a
    # native implementation is never overridden.
    if not hasattr(DebertaV2Tokenizer, "batch_encode_plus"):
        def _dv2_batch_encode_plus(self, batch_text_or_text_pairs, **kwargs):
            """Delegate to the tokenizer's `__call__` with the same arguments."""
            return self(batch_text_or_text_pairs, **kwargs)

        DebertaV2Tokenizer.batch_encode_plus = _dv2_batch_encode_plus
except Exception as _e:
    # Best-effort: report the failure and carry on without the alias.
    print(f"[titlematch-shim] could not patch DebertaV2Tokenizer: {_e}")

import gradio as gr
from sentence_transformers import util as _st_util, SentenceTransformer as _ST
from nltk.tokenize import sent_tokenize as _sent_tokenize
import time

# ── Embedding cache shim ────────────────────────────────────────────────────
# JAAT.TaskMatch / SkillMatch / AIMatch each call embedding_model.encode() on
# a large fixed corpus inside __init__.
# On CPU that's ~30 min of encoding at
# every cold start — enough to fail the Space's 30-min launch health check.
# We precompute those tensors once and host them at pnorlander/jaat-embeddings;
# here we download them plus the corresponding corpus lists, and monkey-patch
# SentenceTransformer.encode so that when JAAT feeds it a known corpus we
# return the cached tensor instead of re-encoding. Query-time encode() calls
# (small inputs) fall through to the real encoder.
#
# Lookup is value-based (not hash-based), so the cache tolerates any corpus
# ordering — e.g. SkillMatch builds its list via `list(set(...))` whose
# iteration order varies between Python versions.
import json
import os

EMBEDDINGS_REPO_ID = os.environ.get("JAAT_EMBEDDINGS_REPO", "pnorlander/jaat-embeddings")

# Each entry: {"name": manifest_key, "pos": {item_text: row_idx}, "tensor": <torch.Tensor>}
_emb_caches = []


def _load_embedding_cache():
    """Download the precomputed corpus embeddings and register them in `_emb_caches`.

    Reads `manifest.json` from `EMBEDDINGS_REPO_ID`; for every manifest entry
    that names a `corpus_file`, downloads the tensor (`entry["file"]`) and the
    corpus list, checks that their lengths agree, and appends a cache record.

    The whole routine is best-effort: any failure (manifest download, manifest
    parsing, or a single corpus) is printed and skipped so the app falls back
    to live encoding instead of crashing at startup.
    """
    import torch
    from huggingface_hub import hf_hub_download

    try:
        manifest_path = hf_hub_download(repo_id=EMBEDDINGS_REPO_ID, filename="manifest.json")
    except Exception as e:
        print(f"[emb-cache] manifest download failed ({e}); falling back to live encoding.")
        return

    # A corrupt or unexpected manifest must degrade the same way a failed
    # download does, not abort the launch.
    try:
        with open(manifest_path, encoding="utf-8") as f:
            manifest = json.load(f)
    except (OSError, ValueError) as e:
        print(f"[emb-cache] manifest unreadable ({e}); falling back to live encoding.")
        return
    if not isinstance(manifest, dict):
        print("[emb-cache] manifest is not a JSON object; falling back to live encoding.")
        return

    for name, entry in manifest.items():
        corpus_file = entry.get("corpus_file") if isinstance(entry, dict) else None
        if not corpus_file:
            print(f"[emb-cache] {name}: no corpus_file in manifest; skipping")
            continue
        try:
            pt_path = hf_hub_download(repo_id=EMBEDDINGS_REPO_ID, filename=entry["file"])
            corpus_path = hf_hub_download(repo_id=EMBEDDINGS_REPO_ID, filename=corpus_file)
            tensor = torch.load(pt_path, map_location="cpu", weights_only=True)
            with open(corpus_path, encoding="utf-8") as f:
                corpus = json.load(f)
            # One tensor row per corpus item is required for the row lookup.
            if len(corpus) != tensor.shape[0]:
                print(f"[emb-cache] {name}: corpus/tensor size mismatch "
                      f"({len(corpus)} vs {tensor.shape[0]}); skipping")
                continue
            pos = {item: i for i, item in enumerate(corpus)}
            _emb_caches.append({"name": name, "pos": pos, "tensor": tensor})
            print(f"[emb-cache] loaded {name}: {tuple(tensor.shape)} ({len(pos)} items)")
        except Exception as e:
            print(f"[emb-cache] {name} failed: {e}")


_load_embedding_cache()

_orig_encode = _ST.encode


def _patched_encode(self, sentences, *args, **kwargs):
    """Serve cached embeddings for a known corpus; otherwise encode normally.

    The cache is consulted only when `sentences` is a list of at least 100
    items and every item is a key of one cached corpus. On a hit, the cached
    rows are returned in the order of `sentences` — as a tensor when
    `convert_to_tensor=True` was passed by keyword, otherwise as a NumPy array.
    Every other call is forwarded unchanged to the original `encode`.
    """
    import torch

    if isinstance(sentences, list) and len(sentences) >= 100 and _emb_caches:
        for entry in _emb_caches:
            pos = entry["pos"]
            try:
                idxs = [pos[s] for s in sentences]
            except (KeyError, TypeError):
                # KeyError: some item is not in this corpus.
                # TypeError: an item is unhashable, so it cannot be a cache key.
                continue
            reordered = entry["tensor"][torch.tensor(idxs)]
            print(f"[emb-cache] HIT on {entry['name']} ({len(sentences)} items) — skipping encode")
            if kwargs.get("convert_to_tensor", False):
                return reordered
            return reordered.numpy()
    return _orig_encode(self, sentences, *args, **kwargs)


_ST.encode = _patched_encode

# Must be imported only after the patches above are installed.
from JAAT import JAAT

# Newer sentence-transformers ships gte-small/gte-large weights as float16,
# so SentenceTransformer.encode() returns float16 tensors. Corpus embeddings
# (from pickle or from the precomputed cache) are float32. torch.mm refuses
# to mix dtypes, so we cast every embedding model to float32 after init.
_orig_titlematch_init = JAAT.TitleMatch.__init__


def _patched_titlematch_init(self, *args, **kwargs):
    """Run the original TitleMatch init, then cast its embeddings and model to float32."""
    _orig_titlematch_init(self, *args, **kwargs)
    import torch
    self.title_embed = self.title_embed.to(torch.float32)
    self.embedding_model = self.embedding_model.float()
    print("[titlematch-shim] title_embed & embedding_model → float32")


JAAT.TitleMatch.__init__ = _patched_titlematch_init

# ── Initialize JAAT modules once at startup ──────────────────────────────────
print("Loading JAAT modules (this may take a moment)...")
task_matcher = JAAT.TaskMatch()
title_matcher = JAAT.TitleMatch()
firm_extractor = JAAT.FirmExtract()
wage_extractor = JAAT.WageExtract()
skill_matcher = JAAT.SkillMatch()
ai_matcher = JAAT.AIMatch()

import torch as _torch

# Same float32 normalisation as the TitleMatch shim, applied after the fact to
# the three matchers whose __init__ is not patched.
for _m in (task_matcher, skill_matcher, ai_matcher):
    _m.embedding_model = _m.embedding_model.float()
task_matcher.task_embed = task_matcher.task_embed.to(_torch.float32)
skill_matcher.skill_embed = skill_matcher.skill_embed.to(_torch.float32)
ai_matcher.ai_embed = ai_matcher.ai_embed.to(_torch.float32)
print("[dtype-shim] TaskMatch/SkillMatch/AIMatch embedding models & corpus → float32")

JOBTAG_CLASSES = [
    "CitizenshipReq",
    "GovContract",
    "VisaExclude",
    "VisaInclude",
    "WorkAuthReq",
    "driverslicense",
    "ind_contractor",
    "proflicenses",
    "wfh",
    "yesunion",
]

TAG_LABELS = {
    "CitizenshipReq": "Citizenship Required",
    "GovContract": "Government Contract",
    "VisaExclude": "Visa Excluded",
    "VisaInclude": "Visa Sponsorship",
    "WorkAuthReq": "Work Auth Required",
    "driverslicense": "Driver's License",
    "ind_contractor": "Independent Contractor",
    "proflicenses": "Professional License",
    "wfh": "Work From Home",
    "yesunion": "Union Position",
}

job_taggers = {cls: JAAT.JobTag(class_name=cls) for cls in JOBTAG_CLASSES}
print("All modules loaded. Ready!")

# Pipeline steps in execution order, and the result slots in the order the UI
# outputs expect them (after the status panel).
_ANALYZE_TOOLS = ("FirmExtract", "WageExtract", "TitleMatch", "TaskMatch",
                  "SkillMatch", "AIMatch", "JobTag")
_RESULT_KEYS = ("firm", "wage", "title", "task", "skill", "ai", "tag", "line")
_STATUS_ICONS = {"pending": "⏳", "running": "🔄", "done": "✅", "error": "❌"}
_SEE_LINE_BY_LINE = "_See Line-by-line Attribution below._"


def format_status(tool_states):
    """Build a markdown status summary of the pipeline.

    `tool_states` maps tool name -> state string. One line is produced per
    tool, separated by blank lines; a state without a known icon is shown with
    a placeholder icon instead of raising.
    """
    return "\n\n".join(
        f"{_STATUS_ICONS.get(state, '❔')} **{tool}** — {state}"
        for tool, state in tool_states.items()
    )


def _md_cell(value):
    """Render `value` as text that is safe inside one markdown table cell."""
    return str(value).replace("|", "\\|").replace("\n", " ").strip()


# ── Line-by-line attribution ────────────────────────────────────────────────
# JAAT's TaskMatch/SkillMatch/AIMatch all split the input with the same
# preprocessing before classifying each sentence. Replicating that split lets
# us line the candidate positives back up with their source sentence.
def _jaat_split(text):
    """Split `text` into sentences using the replacement rules listed above."""
    t = ". ".join(text.split("\n"))
    for a, b in [(";", "."), (" + ", ". "), (" * ", ". "), (" - ", ". "),
                 (" • ", ". "), (" · ", ". "), ("--", ". "), ("**", ". ")]:
        t = t.replace(a, b)
    return _sent_tokenize(t.strip())


def _attribute(sentences, matcher, corpus_attr, label_fn, threshold,
               min_words=0, max_words=10**9):
    """Map sentence indices to the labels of their best corpus match.

    Steps:
      1. keep sentences whose word count is within [min_words, max_words];
      2. run `matcher.pipe` on them and keep those labelled "LABEL_1";
      3. encode the survivors with `matcher.embedding_model` and run
         `semantic_search` (top_k=1) against `getattr(matcher, corpus_attr)`;
      4. keep hits whose score is >= `threshold`.

    `label_fn(corpus_id)` produces the display string for a hit.
    Returns {sentence_idx: [labels]}; sentences without a hit are absent.
    """
    candidates = [
        (i, s) for i, s in enumerate(sentences)
        if min_words <= len(s.split()) <= max_words
    ]
    if not candidates:
        return {}

    # Classify sentences as positive candidates
    preds = list(matcher.pipe([s for _, s in candidates]))
    positive_pairs = [
        pair for pair, p in zip(candidates, preds)
        if p.get("label") == "LABEL_1"
    ]
    if not positive_pairs:
        return {}

    pos_texts = [t for _, t in positive_pairs]
    q = matcher.embedding_model.encode(pos_texts, convert_to_tensor=True, batch_size=64)
    if getattr(matcher, "device", "cpu") == "cuda":
        q = q.to("cuda")
    hits = _st_util.semantic_search(
        corpus_embeddings=getattr(matcher, corpus_attr),
        query_embeddings=q,
        top_k=1,
    )

    out = {}
    for (sent_idx, _), h in zip(positive_pairs, hits):
        if not h:
            # No hit returned for this query; nothing to attribute.
            continue
        best = h[0]
        if best["score"] >= threshold:
            out.setdefault(sent_idx, []).append(label_fn(best["corpus_id"]))
    return out


def _attribution_table(job_text):
    """Return a markdown table pairing each sentence with its Task/Skill/AI hits.

    The three matchers run concurrently in a thread pool. Multiple hits in one
    cell are separated with `<br>`, because a raw newline would end the table
    row; cells without hits show an em dash.
    """
    from concurrent.futures import ThreadPoolExecutor

    sents = _jaat_split(job_text)
    if not sents:
        return "_No sentences to attribute._"

    with ThreadPoolExecutor(max_workers=3) as pool:
        task_fut = pool.submit(
            _attribute, sents, task_matcher, "task_embed",
            lambda cid: (
                f"**{task_matcher.tasks.iloc[cid]['Task ID']}** "
                f"{task_matcher.tasks.iloc[cid]['Task']}"
            ),
            task_matcher.threshold, 1, 48,
        )
        skill_fut = pool.submit(
            _attribute, sents, skill_matcher, "skill_embed",
            lambda cid: (
                f"**{skill_matcher.skill_map[skill_matcher.skills[cid]]}** "
                f"{skill_matcher.skills[cid]}"
            ),
            skill_matcher.threshold, 1, 48,
        )
        ai_fut = pool.submit(
            _attribute, sents, ai_matcher, "ai_embed",
            lambda cid: (
                f"**{ai_matcher.ai_map[ai_matcher.ai[cid]]}** "
                f"{ai_matcher.ai[cid]}"
            ),
            ai_matcher.threshold, 4, 64,
        )
        task_hits = task_fut.result()
        skill_hits = skill_fut.result()
        ai_hits = ai_fut.result()

    def cell(hits, idx):
        # Labels are escaped like the sentence text so a "|" in a task or
        # skill description cannot break the row.
        return "<br>".join(_md_cell(label) for label in hits.get(idx, [])) or "—"

    lines = [
        "| # | Sentence | Tasks (O*NET) | Skills (ESCO) | AI Concepts |",
        "|---|----------|---------------|---------------|-------------|",
    ]
    for i, s in enumerate(sents):
        lines.append(
            f"| {i+1} | {_md_cell(s)} | {cell(task_hits, i)} "
            f"| {cell(skill_hits, i)} | {cell(ai_hits, i)} |"
        )
    return "\n".join(lines)


def _firm_summary(job_text):
    """Return the firm name extracted from the ad, or "Not detected"."""
    tagged = firm_extractor.pipe(job_text)
    firm = firm_extractor.extract_firm(tagged, return_one=True, return_score=False)
    return firm if firm else "Not detected"


def _format_wage_amount(label, value):
    """Format one wage bound as currency, falling back to the raw value."""
    try:
        amount = float(str(value).replace(",", ""))
    except ValueError:
        return f"**{label}:** {value}"
    return f"**{label}:** ${amount:,.2f}"


def _wage_summary(job_text):
    """Return a one-line markdown summary of the wage info found in the ad."""
    wage = wage_extractor.get_wage(job_text)
    if isinstance(wage, str):
        return wage
    if not (isinstance(wage, dict) and wage):
        return "Not found"
    parts = [
        _format_wage_amount(label, wage[key])
        for label, key in (("Min", "min"), ("Max", "max"))
        if wage.get(key)
    ]
    if wage.get("frequency"):
        parts.append(f"**Frequency:** {wage['frequency']}")
    return " | ".join(parts) if parts else "Not found"


def _title_summary(job_title):
    """Return the best O*NET title match for `job_title` as markdown."""
    if not job_title:
        return "No job title provided (enter one above to use TitleMatch)"
    titles = title_matcher.get_title(job_title)
    if not titles:
        return "No match found"
    best = titles[0]  # get_title returns (onet_code, score, value, features)
    onet_code = best[0]
    score_pct = f"{float(best[1]) * 100:.1f}%"
    onet_url = f"https://www.onetonline.org/link/summary/{onet_code}"
    return (
        f"**O*NET Code:** [{onet_code}]({onet_url})\n\n"
        f"**Score:** {score_pct}"
    )


def _task_table(job_text):
    """Return a markdown table of the O*NET tasks matched in the ad."""
    tasks = task_matcher.get_tasks(job_text)
    if not tasks:
        return "No O*NET tasks matched in this ad."
    rows = [f"| {_md_cell(t[0])} | {_md_cell(t[1])} |" for t in tasks]
    return "| Task ID | Description |\n|---------|-------------|\n" + "\n".join(rows)


def _skill_table(job_text):
    """Return a markdown table of the ESCO skills matched in the ad."""
    skills = skill_matcher.get_skills(job_text)
    if not skills:
        return "No skills matched in this ad."
    rows = [f"| {_md_cell(s[1])} | {_md_cell(s[0])} |" for s in skills]
    return "| ESCO Code | Skill |\n|-----------|-------|\n" + "\n".join(rows)


def _ai_table(job_text):
    """Return a markdown summary and table of the AI concepts matched in the ad."""
    none_found = "No AI-related concepts detected."
    ai_result = ai_matcher.get_ai(job_text)
    if not (isinstance(ai_result, (list, tuple)) and len(ai_result) >= 3):
        return none_found
    # NOTE(review): layout (matched, count, avg_score, binary_scores,
    # match_scores) is inferred from how this script unpacks the result —
    # confirm against JAAT.AIMatch.get_ai.
    matched_ai, count, avg_score = ai_result[:3]
    if not matched_ai:
        return none_found
    # Only the first three fields are guaranteed by the length check; leave
    # the score column blank when per-match scores are not returned.
    match_scores = ai_result[4] if len(ai_result) >= 5 else [""] * len(matched_ai)

    # Sort by last 5 digits of code descending
    indexed = sorted(
        zip(matched_ai, match_scores),
        key=lambda pair: pair[0][1][-5:],
        reverse=True,
    )
    rows = [
        f"| {_md_cell(code)} | {_md_cell(statement)} | {_md_cell(ms)} |"
        for (statement, code), ms in indexed
    ]
    return (
        f"**AI Concepts Found:** {count} | **Avg Score:** {avg_score}\n\n"
        "| Code | Statement | Match Score |\n|------|-----------|-------------|\n"
        + "\n".join(rows)
    )


def _tag_table(job_text):
    """Return a markdown table with one detected/not-detected row per job tag."""
    rows = []
    for cls, tagger in job_taggers.items():
        pred = tagger.get_tag(job_text)
        icon = "✅" if bool(pred[1]) else "—"
        rows.append(f"| {TAG_LABELS.get(cls, cls)} | {icon} |")
    return "| Attribute | Detected |\n|-----------|----------|\n" + "\n".join(rows)


def analyze(job_title, job_text, mode="Summary", progress=gr.Progress(track_tqdm=True)):
    """Run every JAAT tool on a job ad, streaming results to the UI.

    Generator used as the Gradio click handler. Each yield is a 9-tuple:
    (status markdown, firm, wage, title, task, skill, ai, tag, line-by-line),
    matching the `outputs` list wired to the Analyze button. A snapshot is
    yielded when a tool starts and again when it finishes, so the status
    panel updates live. A tool that raises shows "Error: ..." in its own slot
    and the pipeline continues.

    Args:
        job_title: Title passed to TitleMatch; empty/None skips the lookup.
        job_text: Full ad text; empty/None yields a prompt message and stops.
        mode: "Summary" for per-tool tables, "Line-by-line" to replace the
            Task/Skill/AI tables with a per-sentence attribution table.
        progress: Not read here; declared so Gradio can track tqdm progress.
    """
    job_title = (job_title or "").strip()
    job_text = job_text or ""
    line_by_line = (mode == "Line-by-line")

    if not job_text.strip():
        yield ("Please paste a job ad first.",) + ("",) * len(_RESULT_KEYS)
        return

    states = {tool: "pending" for tool in _ANALYZE_TOOLS}
    results = {key: "" for key in _RESULT_KEYS}

    def snapshot():
        return (format_status(states), *(results[key] for key in _RESULT_KEYS))

    def step(tool, key, compute):
        # Mark running -> yield -> compute -> mark done/error -> yield.
        states[tool] = "running"
        yield snapshot()
        try:
            results[key] = compute()
            states[tool] = "done"
        except Exception as e:
            results[key] = f"Error: {e}"
            states[tool] = "error"
        yield snapshot()

    def summary_or_placeholder(build):
        # In line-by-line mode the per-tool table is replaced by a pointer to
        # the attribution table built at the end.
        if line_by_line:
            return lambda: _SEE_LINE_BY_LINE
        return lambda: build(job_text)

    yield from step("FirmExtract", "firm", lambda: _firm_summary(job_text))
    yield from step("WageExtract", "wage", lambda: _wage_summary(job_text))
    yield from step("TitleMatch", "title", lambda: _title_summary(job_title))
    yield from step("TaskMatch", "task", summary_or_placeholder(_task_table))
    yield from step("SkillMatch", "skill", summary_or_placeholder(_skill_table))
    yield from step("AIMatch", "ai", summary_or_placeholder(_ai_table))
    yield from step("JobTag", "tag", lambda: _tag_table(job_text))

    # ── Line-by-line Attribution (Tasks/Skills/AI) ──────────────────────
    if line_by_line:
        try:
            results["line"] = _attribution_table(job_text)
        except Exception as e:
            results["line"] = f"Error building attribution table: {e}"
        yield snapshot()


CITATION = """**Software & Data Citation**

If you use JAAT in your research, please cite:

```bibtex
@article{meisenbacher2025extracting,
  title={Extracting O*NET Features from the NLx Corpus to Build Public Use Aggregate Labor Market Data},
  author={Meisenbacher, Stephen and Nestorov, Svetlozar and Norlander, Peter},
  journal={arXiv preprint arXiv:2510.01470},
  year={2025}
}
```
"""

# Kept flush-left so the heading renders as a heading regardless of how the
# markdown component treats leading indentation.
_HEADER_MD = """
# JAAT — Job Ad Analysis Toolkit

Paste a job advertisement to extract O*NET tasks, skills, title match, firm name, wages, and job tags.

[GitHub](https://github.com/Job-Ad-Research-at-QSB-LUC/JAAT)
"""

with gr.Blocks(title="JAAT — Job Ad Analysis Toolkit") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Row():
        with gr.Column(scale=2):
            job_title = gr.Textbox(
                label="Job Title (used by TitleMatch)",
                placeholder='e.g. "Software Engineer" or "Registered Nurse"',
                lines=1,
            )
            job_text = gr.Textbox(
                label="Full Job Advertisement Text",
                placeholder="Paste the full text of a job posting here, then click Analyze...",
                lines=12,
            )
            mode = gr.Radio(
                choices=["Summary", "Line-by-line"],
                value="Summary",
                label="Display mode",
                info="Summary = unique matches in tables. Line-by-line = each "
                     "sentence shown with the Tasks / Skills / AI Concepts it "
                     "triggered.",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=1):
            pipeline_status = gr.Markdown("Pipeline status will appear here.")

    gr.Markdown("---")
    gr.Markdown("### Results")

    with gr.Row():
        with gr.Column():
            gr.Markdown("**FirmExtract**")
            firm_output = gr.Markdown()
        with gr.Column():
            gr.Markdown("**WageExtract**")
            wage_output = gr.Markdown()
        with gr.Column():
            gr.Markdown("**TitleMatch**")
            title_output = gr.Markdown()

    with gr.Accordion("TaskMatch — O*NET Tasks", open=True):
        task_output = gr.Markdown()
    with gr.Accordion("SkillMatch — ESCO Skills", open=True):
        skill_output = gr.Markdown()
    with gr.Accordion("AIMatch — AI Concepts", open=True):
        ai_output = gr.Markdown()
    with gr.Accordion("JobTag — Job Attributes", open=True):
        tag_output = gr.Markdown()
    with gr.Accordion(
        "Line-by-line Attribution (Tasks / Skills / AI)",
        open=True,
    ):
        line_output = gr.Markdown(
            "Select **Line-by-line** mode and click Analyze to see each "
            "sentence of the ad paired with the O*NET tasks, ESCO skills, "
            "and AI concepts it triggered."
        )

    gr.Markdown(CITATION)

    # Output order must match the tuple yielded by `analyze`.
    analyze_btn.click(
        fn=analyze,
        inputs=[job_title, job_text, mode],
        outputs=[pipeline_status, firm_output, wage_output, title_output,
                 task_output, skill_output, ai_output, tag_output, line_output],
    )

demo.launch(server_name="0.0.0.0", server_port=7860)