{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "cea0d83b-7dfe-4bfb-b9ca-6d3114dd3a7a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"HF_TOKEN\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "257f2b76-59ca-498e-97ac-fdb543d7a47f",
"metadata": {},
"outputs": [],
"source": [
"pip install datasets huggingface_hub transformers sentencepiece tiktoken protobuf pandas pyarrow wandb"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "376e8042-91ac-4690-a956-5d598e9677c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded to:\n",
" train: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/train.parquet\n",
" val: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/val.parquet\n",
" test: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/test.parquet\n"
]
}
],
"source": [
"from huggingface_hub import hf_hub_download\n",
"import pandas as pd\n",
"\n",
"REPO_ID = \"akzaidan/Job_Data_Parser\"\n",
"\n",
"# Download only the 3 parquet files (no full dataset clone needed)\n",
"train_path = hf_hub_download(repo_id=REPO_ID, filename=\"train.parquet\", repo_type=\"dataset\")\n",
"val_path = hf_hub_download(repo_id=REPO_ID, filename=\"val.parquet\", repo_type=\"dataset\")\n",
"test_path = hf_hub_download(repo_id=REPO_ID, filename=\"test.parquet\", repo_type=\"dataset\")\n",
"\n",
"print(\"Downloaded to:\")\n",
"print(\" train:\", train_path)\n",
"print(\" val: \", val_path)\n",
"print(\" test: \", test_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fccc2393-3220-4e4d-ab03-41702bc00c96",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train: 249,379 rows\n",
"Val: 14,964 rows\n",
"Test: 9,973 rows\n"
]
}
],
"source": [
"train_df = pd.read_parquet(train_path)\n",
"val_df = pd.read_parquet(val_path)\n",
"test_df = pd.read_parquet(test_path)\n",
"\n",
"print(f\"Train: {len(train_df):,} rows\")\n",
"print(f\"Val: {len(val_df):,} rows\")\n",
"print(f\"Test: {len(test_df):,} rows\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e04be49b-1b94-45aa-bb5d-047edb74a7f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['text', 'pay_lower', 'pay_upper', 'expected_experience_years']\n",
"text str\n",
"pay_lower int64\n",
"pay_upper int64\n",
"expected_experience_years int64\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" pay_lower | \n",
" pay_upper | \n",
" expected_experience_years | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" [TITLE]: Facilities Officer [DESC]: **37 hours... | \n",
" 30000 | \n",
" 30000 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" [TITLE]: Floor refinishing technician [DESC]: ... | \n",
" 33280 | \n",
" 37440 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" [TITLE]: Registrar in Haematology - Immediate ... | \n",
" 80000 | \n",
" 110000 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text pay_lower pay_upper \\\n",
"0 [TITLE]: Facilities Officer [DESC]: **37 hours... 30000 30000 \n",
"1 [TITLE]: Floor refinishing technician [DESC]: ... 33280 37440 \n",
"2 [TITLE]: Registrar in Haematology - Immediate ... 80000 110000 \n",
"\n",
" expected_experience_years \n",
"0 0 \n",
"1 0 \n",
"2 3 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(train_df.columns.tolist())\n",
"print(train_df.dtypes)\n",
"train_df.head(3)"
]
},
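{
"cell_type": "code",
"execution_count": null,
"id": "0f4a2d1e-7b3c-4a58-9e6f-1a2b3c4d5e60",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: the label columns use -1 as a missing-value sentinel\n",
"# (handled in the next cell), so count how many rows each split would lose.\n",
"# A minimal sketch over the raw frames loaded above.\n",
"for name, df in [(\"train\", train_df), (\"val\", val_df), (\"test\", test_df)]:\n",
"    missing = {c: int((df[c] == -1).sum()) for c in [\"expected_experience_years\", \"pay_lower\", \"pay_upper\"]}\n",
"    print(f\"{name}: {missing}\")"
]
},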
{
"cell_type": "code",
"execution_count": 6,
"id": "9b7cc3d4-cff8-480f-b124-e35c88248891",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train after dropping missing: 249,379\n",
"Val after dropping missing: 14,964\n",
"Test after dropping missing: 9,973\n",
"\n",
"norm_stats (SAVE THESE):\n",
" expected_experience_years: mean=2.8786, std=2.7134\n",
" pay_lower: mean=70610.1505, std=54165.9840\n",
" pay_upper: mean=92701.2608, std=69885.4088\n",
"\n",
"Normalized target distributions:\n",
" expected_experience_years pay_lower pay_upper\n",
"count 249379.000 249379.000 249379.000\n",
"mean -0.000 0.000 0.000\n",
"std 1.000 1.000 1.000\n",
"min -1.061 -0.750 -0.897\n",
"25% -0.692 -0.651 -0.683\n",
"50% -0.324 -0.196 -0.225\n",
"75% 0.782 0.266 0.319\n",
"max 6.310 7.927 5.828\n"
]
}
],
"source": [
"import numpy as np\n",
"from datasets import Dataset, DatasetDict\n",
"\n",
"# Drop rows where ANY label is -1 (missing)\n",
"def drop_missing(df):\n",
" return df[\n",
" (df[\"expected_experience_years\"] != -1) &\n",
" (df[\"pay_lower\"] != -1) &\n",
" (df[\"pay_upper\"] != -1)\n",
" ].reset_index(drop=True)\n",
"\n",
"train_clean = drop_missing(train_df)\n",
"val_clean = drop_missing(val_df)\n",
"test_clean = drop_missing(test_df)\n",
"\n",
"print(f\"Train after dropping missing: {len(train_clean):,}\")\n",
"print(f\"Val after dropping missing: {len(val_clean):,}\")\n",
"print(f\"Test after dropping missing: {len(test_clean):,}\")\n",
"\n",
"# Z-score normalize all 3 targets using ONLY train stats\n",
"TARGET_COLS = [\"expected_experience_years\", \"pay_lower\", \"pay_upper\"]\n",
"\n",
"norm_stats = {}\n",
"for col in TARGET_COLS:\n",
" mean = train_clean[col].mean()\n",
" std = train_clean[col].std()\n",
" norm_stats[col] = {\"mean\": mean, \"std\": std}\n",
"\n",
" train_clean[col] = (train_clean[col] - mean) / std\n",
" val_clean[col] = (val_clean[col] - mean) / std\n",
" test_clean[col] = (test_clean[col] - mean) / std\n",
"\n",
"# Print stats so you can save them for inference later\n",
"print(\"\\nnorm_stats (SAVE THESE):\")\n",
"for col, stats in norm_stats.items():\n",
" print(f\" {col}: mean={stats['mean']:.4f}, std={stats['std']:.4f}\")\n",
"\n",
"# Sanity check — all 3 columns should now be centered near 0\n",
"print(\"\\nNormalized target distributions:\")\n",
"print(train_clean[TARGET_COLS].describe().round(3))\n",
"\n",
"# Convert to HuggingFace DatasetDict\n",
"dataset = DatasetDict({\n",
" \"train\": Dataset.from_pandas(train_clean),\n",
" \"val\": Dataset.from_pandas(val_clean),\n",
" \"test\": Dataset.from_pandas(test_clean),\n",
"})"
]
},
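{
"cell_type": "code",
"execution_count": null,
"id": "1a5b3c2d-8e4f-4b69-a07e-2b3c4d5e6f71",
"metadata": {},
"outputs": [],
"source": [
"# Optional round-trip check on the z-score normalization above: denormalizing\n",
"# the normalized train labels should reproduce the raw values. A minimal\n",
"# sketch reusing drop_missing and norm_stats from the previous cell.\n",
"raw = drop_missing(train_df)\n",
"for col in TARGET_COLS:\n",
"    restored = train_clean[col] * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n",
"    print(f\"{col}: round-trip ok = {np.allclose(restored, raw[col])}\")"
]
},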
{
"cell_type": "code",
"execution_count": 7,
"id": "504d2542-3efa-4498-afec-3cd0d9184395",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n",
"GPU: NVIDIA GeForce RTX 5090\n"
]
}
],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"from transformers import AutoTokenizer, AutoModel\n",
"\n",
"MODEL_NAME = \"microsoft/deberta-v3-small\"\n",
"MAX_LENGTH = 512 # DeBERTa-v3-small's max token length\n",
"DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(f\"Using device: {DEVICE}\")\n",
"print(f\"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "facf872f-9e95-49e8-9a1d-48b875eb4b6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Torch version: 2.8.0+cu128\n",
"CUDA available: True\n",
"CUDA version torch was built with: 12.8\n",
"Tensor on GPU: tensor([1.], device='cuda:0')\n"
]
}
],
"source": [
"import torch\n",
"\n",
"print(f\"Torch version: {torch.__version__}\")\n",
"print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
"print(f\"CUDA version torch was built with: {torch.version.cuda}\")\n",
"\n",
"if torch.cuda.is_available():\n",
" x = torch.tensor([1.0]).cuda()\n",
" print(f\"Tensor on GPU: {x}\")\n",
"else:\n",
" print(\"CUDA not available — check torch build\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cf05f57f-affb-4215-95e8-8a095b2c9d69",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dc832c3c14c04c9488e1f3691e3d996a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tokenizing: 0%| | 0/249379 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e67735248dac4e528a094dd3b74c98a9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tokenizing: 0%| | 0/14964 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0e6d819254de422f95bfc051704725a5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tokenizing: 0%| | 0/9973 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tokenization complete:\n",
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['text', 'pay_lower', 'pay_upper', 'expected_experience_years', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 249379\n",
" })\n",
" val: Dataset({\n",
" features: ['text', 'pay_lower', 'pay_upper', 'expected_experience_years', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 14964\n",
" })\n",
" test: Dataset({\n",
" features: ['text', 'pay_lower', 'pay_upper', 'expected_experience_years', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 9973\n",
" })\n",
"})\n"
]
}
],
"source": [
"from transformers import DebertaV2Tokenizer\n",
"\n",
"tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)\n",
"\n",
"def tokenize(batch):\n",
" return tokenizer(\n",
" batch[\"text\"],\n",
" padding=\"max_length\",\n",
" truncation=True,\n",
" max_length=MAX_LENGTH,\n",
" )\n",
"\n",
"# Apply tokenizer to all splits\n",
"dataset_tokenized = dataset.map(\n",
" tokenize,\n",
" batched=True,\n",
" batch_size=1000,\n",
" desc=\"Tokenizing\"\n",
")\n",
"\n",
"# Set format for PyTorch — include the 3 label columns\n",
"dataset_tokenized.set_format(\n",
" type=\"torch\",\n",
" columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\",\n",
" \"expected_experience_years\", \"pay_lower\", \"pay_upper\"]\n",
")\n",
"\n",
"print(\"Tokenization complete:\")\n",
"print(dataset_tokenized)"
]
},
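{
"cell_type": "code",
"execution_count": null,
"id": "2b6c4d3e-9f50-4c7a-b18f-3c4d5e6f7082",
"metadata": {},
"outputs": [],
"source": [
"# Optional: decode one tokenized example to eyeball the [TITLE]/[DESC] markup,\n",
"# truncation, and padding. A minimal sketch over the tokenized train split.\n",
"sample = dataset_tokenized[\"train\"][0]\n",
"n_real = int(sample[\"attention_mask\"].sum())\n",
"print(f\"non-padding tokens: {n_real}/{MAX_LENGTH}\")\n",
"print(tokenizer.decode(sample[\"input_ids\"][:60]))"
]
},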
{
"cell_type": "code",
"execution_count": 34,
"id": "ec81b063-c981-41b3-8cbf-3494fe0cebab",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d759d71463d548db9d08694997917dca",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading weights: 0%| | 0/102 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[1mDebertaV2Model LOAD REPORT\u001b[0m from: microsoft/deberta-v3-small\n",
"Key | Status | | \n",
"----------------------------------------+------------+--+-\n",
"lm_predictions.lm_head.LayerNorm.bias | UNEXPECTED | | \n",
"mask_predictions.classifier.weight | UNEXPECTED | | \n",
"mask_predictions.classifier.bias | UNEXPECTED | | \n",
"mask_predictions.LayerNorm.bias | UNEXPECTED | | \n",
"mask_predictions.dense.bias | UNEXPECTED | | \n",
"lm_predictions.lm_head.bias | UNEXPECTED | | \n",
"lm_predictions.lm_head.dense.weight | UNEXPECTED | | \n",
"lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | | \n",
"mask_predictions.LayerNorm.weight | UNEXPECTED | | \n",
"mask_predictions.dense.weight | UNEXPECTED | | \n",
"lm_predictions.lm_head.dense.bias | UNEXPECTED | | \n",
"\n",
"\u001b[3mNotes:\n",
"- UNEXPECTED\u001b[3m\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"JobRegressorModel(\n",
" (encoder): DebertaV2Model(\n",
" (embeddings): DebertaV2Embeddings(\n",
" (word_embeddings): Embedding(128100, 768, padding_idx=0)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): DebertaV2Encoder(\n",
" (layer): ModuleList(\n",
" (0-5): 6 x DebertaV2Layer(\n",
" (attention): DebertaV2Attention(\n",
" (self): DisentangledSelfAttention(\n",
" (query_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (key_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (value_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (pos_dropout): Dropout(p=0.1, inplace=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): DebertaV2SelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): DebertaV2Intermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): DebertaV2Output(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (rel_embeddings): Embedding(512, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (head_experience): Sequential(\n",
" (0): Linear(in_features=768, out_features=256, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Dropout(p=0.1, inplace=False)\n",
" (3): Linear(in_features=256, out_features=1, bias=True)\n",
" )\n",
" (head_pay_lower): Sequential(\n",
" (0): Linear(in_features=768, out_features=256, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Dropout(p=0.1, inplace=False)\n",
" (3): Linear(in_features=256, out_features=1, bias=True)\n",
" )\n",
" (head_pay_upper): Sequential(\n",
" (0): Linear(in_features=768, out_features=256, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Dropout(p=0.1, inplace=False)\n",
" (3): Linear(in_features=256, out_features=1, bias=True)\n",
" )\n",
")\n",
"\n",
"Trainable parameters: 141,895,683\n"
]
}
],
"source": [
"class JobRegressorModel(nn.Module):\n",
" def __init__(self, model_name, dropout=0.1):\n",
" super().__init__()\n",
"\n",
" # Shared DeBERTa encoder backbone\n",
" self.encoder = AutoModel.from_pretrained(model_name)\n",
" hidden_size = self.encoder.config.hidden_size # 768 for deberta-v3-small\n",
"\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" # 3 independent regression heads\n",
" self.head_experience = nn.Sequential(\n",
" nn.Linear(hidden_size, 256),\n",
" nn.GELU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(256, 1)\n",
" )\n",
" self.head_pay_lower = nn.Sequential(\n",
" nn.Linear(hidden_size, 256),\n",
" nn.GELU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(256, 1)\n",
" )\n",
" self.head_pay_upper = nn.Sequential(\n",
" nn.Linear(hidden_size, 256),\n",
" nn.GELU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(256, 1)\n",
" )\n",
"\n",
" def forward(self, input_ids, attention_mask, token_type_ids=None):\n",
" outputs = self.encoder(\n",
" input_ids=input_ids,\n",
" attention_mask=attention_mask,\n",
" token_type_ids=token_type_ids\n",
" )\n",
" \n",
" # Cast to float32 before regression heads\n",
" cls_embedding = self.dropout(outputs.last_hidden_state[:, 0, :]).float()\n",
" \n",
" exp = self.head_experience(cls_embedding).squeeze(-1)\n",
" lower = self.head_pay_lower(cls_embedding).squeeze(-1)\n",
" upper = self.head_pay_upper(cls_embedding).squeeze(-1)\n",
" \n",
" return exp, lower, upper\n",
"\n",
"model = JobRegressorModel(MODEL_NAME).to(DEVICE).float()\n",
"print(model)\n",
"\n",
"# Sanity check — count trainable parameters\n",
"total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(f\"\\nTrainable parameters: {total_params:,}\")"
]
},
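{
"cell_type": "code",
"execution_count": null,
"id": "3c7d5e4f-0a61-4d8b-9290-4d5e6f708193",
"metadata": {},
"outputs": [],
"source": [
"# Optional shape check: push one dummy example through the untrained model to\n",
"# confirm each head emits a single scalar per input. A minimal sketch using\n",
"# the tokenizer and MAX_LENGTH defined earlier.\n",
"enc = tokenizer(\"[TITLE]: test [DESC]: test\", padding=\"max_length\", truncation=True, max_length=MAX_LENGTH, return_tensors=\"pt\")\n",
"with torch.no_grad():\n",
"    e, lo, up = model(enc[\"input_ids\"].to(DEVICE), enc[\"attention_mask\"].to(DEVICE), enc[\"token_type_ids\"].to(DEVICE))\n",
"print(e.shape, lo.shape, up.shape)  # expect torch.Size([1]) for each head"
]
},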
{
"cell_type": "code",
"execution_count": 11,
"id": "9875a16a-c620-4433-850a-31cb03a3d1c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wandb_v1_\n"
]
}
],
"source": [
"wandb_key = \"wandb_v1_\"\n",
"print(wandb_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b47af66-3517-440e-ae03-2cb4a74929c7",
"metadata": {},
"outputs": [],
"source": [
"import wandb\n",
"\n",
"wandb.login(key=wandb_key)\n",
"\n",
"wandb.init(\n",
" project=\"job-parser\",\n",
" name=\"deberta-v3-small-run1\",\n",
" config={\n",
" \"model\": \"microsoft/deberta-v3-small\",\n",
" \"epochs\": 4,\n",
" \"batch_size\": 32,\n",
" \"learning_rate\": 2e-5,\n",
" \"max_length\": 512,\n",
" \"train_samples\": len(dataset[\"train\"]),\n",
" \"val_samples\": len(dataset[\"val\"]),\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "0bc2aefc-afb7-4212-88da-e0fdfa735e61",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train batches: 7,794\n",
"Val batches: 234\n"
]
}
],
"source": [
"from torch.utils.data import DataLoader\n",
"\n",
"def collate_fn(batch):\n",
" return {\n",
" \"input_ids\": torch.stack([x[\"input_ids\"] for x in batch]),\n",
" \"attention_mask\": torch.stack([x[\"attention_mask\"] for x in batch]),\n",
" \"token_type_ids\": torch.stack([x[\"token_type_ids\"] for x in batch]),\n",
" \"expected_experience_years\": torch.tensor([x[\"expected_experience_years\"] for x in batch], dtype=torch.float32),\n",
" \"pay_lower\": torch.tensor([x[\"pay_lower\"] for x in batch], dtype=torch.float32),\n",
" \"pay_upper\": torch.tensor([x[\"pay_upper\"] for x in batch], dtype=torch.float32),\n",
" }\n",
"\n",
"train_loader = DataLoader(dataset_tokenized[\"train\"], batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=4, pin_memory=True)\n",
"val_loader = DataLoader(dataset_tokenized[\"val\"], batch_size=64, shuffle=False, collate_fn=collate_fn, num_workers=4, pin_memory=True)\n",
"\n",
"print(f\"Train batches: {len(train_loader):,}\")\n",
"print(f\"Val batches: {len(val_loader):,}\")"
]
},
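{
"cell_type": "code",
"execution_count": null,
"id": "4d8e6f50-1b72-4e9c-a3a1-5e6f70819204",
"metadata": {},
"outputs": [],
"source": [
"# Optional: pull a single batch to verify the collate_fn output shapes and\n",
"# dtypes before training. A minimal sketch; just iterates the loader once.\n",
"batch = next(iter(train_loader))\n",
"for k, v in batch.items():\n",
"    print(f\"{k:28s} shape={tuple(v.shape)} dtype={v.dtype}\")"
]
},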
{
"cell_type": "code",
"execution_count": 36,
"id": "03964a4a-fb55-41d9-b85f-b10229537087",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total training steps: 31,176\n",
"Warmup steps: 500\n"
]
}
],
"source": [
"from torch.optim import AdamW\n",
"from transformers import get_linear_schedule_with_warmup\n",
"\n",
"EPOCHS = 4\n",
"LR = 2e-5\n",
"WARMUP_STEPS = 500\n",
"TOTAL_STEPS = len(train_loader) * EPOCHS\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)\n",
"\n",
"scheduler = get_linear_schedule_with_warmup(\n",
" optimizer,\n",
" num_warmup_steps=WARMUP_STEPS,\n",
" num_training_steps=TOTAL_STEPS\n",
")\n",
"\n",
"loss_fn = torch.nn.MSELoss()\n",
"\n",
"print(f\"Total training steps: {TOTAL_STEPS:,}\")\n",
"print(f\"Warmup steps: {WARMUP_STEPS}\")"
]
},
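{
"cell_type": "code",
"execution_count": null,
"id": "5e9f7061-2c83-4fad-b4b2-6f7081920315",
"metadata": {},
"outputs": [],
"source": [
"# Optional: the linear schedule ramps the LR from 0 to LR over WARMUP_STEPS,\n",
"# then decays linearly to 0 at TOTAL_STEPS. A minimal sketch of that formula\n",
"# (it mirrors get_linear_schedule_with_warmup) at a few representative steps,\n",
"# without touching the real scheduler's state.\n",
"def lr_at(step):\n",
"    if step < WARMUP_STEPS:\n",
"        return LR * step / max(1, WARMUP_STEPS)\n",
"    return LR * max(0.0, (TOTAL_STEPS - step) / max(1, TOTAL_STEPS - WARMUP_STEPS))\n",
"\n",
"for s in [0, 250, WARMUP_STEPS, TOTAL_STEPS // 2, TOTAL_STEPS]:\n",
"    print(f\"step {s:>6,}: lr = {lr_at(s):.2e}\")"
]
},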
{
"cell_type": "code",
"execution_count": 37,
"id": "a74b65ce-a7cb-48d2-8339-e31b15f58053",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 1/4 [Train]: 100%|██████████| 7794/7794 [27:19<00:00, 4.76it/s, loss=0.2527] \n",
"Epoch 1/4 [Val]: 100%|██████████| 234/234 [00:37<00:00, 6.19it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Epoch 1 Summary:\n",
" Train Loss: 1.2549\n",
" Val Loss: 1.4748 (exp=0.1359, lower=0.7391, upper=0.5999)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save(\"/mnt/folder/file.h5\", base_path=\"/mnt\")\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" ✓ New best model saved (val_loss=1.4748)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 2/4 [Train]: 100%|██████████| 7794/7794 [27:30<00:00, 4.72it/s, loss=0.1510]\n",
"Epoch 2/4 [Val]: 100%|██████████| 234/234 [00:37<00:00, 6.20it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Epoch 2 Summary:\n",
" Train Loss: 0.9742\n",
" Val Loss: 1.4213 (exp=0.1192, lower=0.7282, upper=0.5739)\n",
" ✓ New best model saved (val_loss=1.4213)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 3/4 [Train]: 100%|██████████| 7794/7794 [27:30<00:00, 4.72it/s, loss=0.2923]\n",
"Epoch 3/4 [Val]: 100%|██████████| 234/234 [00:38<00:00, 6.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Epoch 3 Summary:\n",
" Train Loss: 0.8794\n",
" Val Loss: 1.3788 (exp=0.1200, lower=0.7105, upper=0.5483)\n",
" ✓ New best model saved (val_loss=1.3788)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 4/4 [Train]: 100%|██████████| 7794/7794 [27:32<00:00, 4.72it/s, loss=0.1689]\n",
"Epoch 4/4 [Val]: 100%|██████████| 234/234 [00:37<00:00, 6.20it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Epoch 4 Summary:\n",
" Train Loss: 0.7895\n",
" Val Loss: 1.3985 (exp=0.1174, lower=0.7197, upper=0.5614)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/html": [],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
Run history:
| epoch | ▁▃▆█ |
| step | ▁▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████ |
| train/epoch_loss | █▄▂▁ |
| train/loss | ▆▂▂▆▆▁▃▂▅▂▁▁▁▂▁▂▁▁▁▂▁▁▁▁█▆▁█▂▂▁▁▁▅▁▆▁▁▁ |
| train/loss_exp | █▆▄▆▂▂▂▂▁▁▁▂▁▂▁▁▂▂▁▁▁▂▁▁▁▂▁▁▂▁▁▄▂▁▁▁▁▁ |
| train/loss_lower | ▃ ▃▅▂▁▂█▁▁▁▇▁▁▁▁▂▆▁▁▁█▁▃▃▁▁▁▁▁▁▁▁▁▆▁▁▁ |
| train/loss_upper | ▃▂▇▃▂▅▂▁▂▁▂▂▁▁▇▁▂▁▆▁▂▂▁▂▂█▂▁▁▁▁▁▅█▁▃▁▁▁▁ |
| train/lr | ▅▂▂▁██▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁ |
| val/epoch_loss | █▄▁▂ |
| val/loss_exp | █▂▂▁ |
| +2 | ... |
Run summary:
| epoch | 4 |
| step | 31082 |
| train/epoch_loss | 0.7895 |
| train/loss | 0.69179 |
| train/loss_exp | 0.04378 |
| train/loss_lower | 0.05979 |
| train/loss_upper | 0.58822 |
| train/lr | 0.0 |
| val/epoch_loss | 1.39849 |
| val/loss_exp | 0.1174 |
| +2 | ... |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run deberta-v3-small-run1 at: https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/zmn5c10d
View project at: https://wandb.ai/zaidana-oregon-state-university/job-parser
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Find logs at: ./wandb/run-20260304_235907-zmn5c10d/logs"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Training complete!\n"
]
}
],
"source": [
"import os\n",
"from tqdm import tqdm\n",
"\n",
"CHECKPOINT_DIR = \"/workspace/checkpoints\"\n",
"os.makedirs(CHECKPOINT_DIR, exist_ok=True)\n",
"\n",
"best_val_loss = float(\"inf\")\n",
"\n",
"for epoch in range(EPOCHS):\n",
" # ── TRAIN ──────────────────────────────────────────────\n",
" model.train()\n",
" train_loss = 0\n",
" train_loss_exp = train_loss_lower = train_loss_upper = 0\n",
"\n",
" pbar = tqdm(train_loader, desc=f\"Epoch {epoch+1}/{EPOCHS} [Train]\")\n",
" for step, batch in enumerate(pbar):\n",
" input_ids = batch[\"input_ids\"].to(DEVICE)\n",
" attention_mask = batch[\"attention_mask\"].to(DEVICE)\n",
" token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n",
" label_exp = batch[\"expected_experience_years\"].to(DEVICE)\n",
" label_lower = batch[\"pay_lower\"].to(DEVICE)\n",
" label_upper = batch[\"pay_upper\"].to(DEVICE)\n",
"\n",
" optimizer.zero_grad()\n",
"\n",
" pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n",
" loss_exp = loss_fn(pred_exp, label_exp)\n",
" loss_lower = loss_fn(pred_lower, label_lower)\n",
" loss_upper = loss_fn(pred_upper, label_upper)\n",
" loss = loss_exp + loss_lower + loss_upper\n",
"\n",
" loss.backward()\n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
" optimizer.step()\n",
" scheduler.step()\n",
"\n",
" train_loss += loss.item()\n",
" train_loss_exp += loss_exp.item()\n",
" train_loss_lower += loss_lower.item()\n",
" train_loss_upper += loss_upper.item()\n",
"\n",
" # Log to wandb every 100 steps\n",
" if step % 100 == 0:\n",
" wandb.log({\n",
" \"train/loss\": loss.item(),\n",
" \"train/loss_exp\": loss_exp.item(),\n",
" \"train/loss_lower\": loss_lower.item(),\n",
" \"train/loss_upper\": loss_upper.item(),\n",
" \"train/lr\": scheduler.get_last_lr()[0],\n",
" \"step\": epoch * len(train_loader) + step,\n",
" })\n",
"\n",
" pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})\n",
"\n",
" avg_train_loss = train_loss / len(train_loader)\n",
"\n",
" # ── VALIDATE ───────────────────────────────────────────\n",
" model.eval()\n",
" val_loss = 0\n",
" val_loss_exp = val_loss_lower = val_loss_upper = 0\n",
"\n",
" with torch.no_grad():\n",
" for batch in tqdm(val_loader, desc=f\"Epoch {epoch+1}/{EPOCHS} [Val]\"):\n",
" input_ids = batch[\"input_ids\"].to(DEVICE)\n",
" attention_mask = batch[\"attention_mask\"].to(DEVICE)\n",
" token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n",
" label_exp = batch[\"expected_experience_years\"].to(DEVICE)\n",
" label_lower = batch[\"pay_lower\"].to(DEVICE)\n",
" label_upper = batch[\"pay_upper\"].to(DEVICE)\n",
"\n",
" pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n",
" loss_exp = loss_fn(pred_exp, label_exp)\n",
" loss_lower = loss_fn(pred_lower, label_lower)\n",
" loss_upper = loss_fn(pred_upper, label_upper)\n",
" loss = loss_exp + loss_lower + loss_upper\n",
"\n",
" val_loss += loss.item()\n",
" val_loss_exp += loss_exp.item()\n",
" val_loss_lower += loss_lower.item()\n",
" val_loss_upper += loss_upper.item()\n",
"\n",
" avg_val_loss = val_loss / len(val_loader)\n",
" avg_val_loss_exp = val_loss_exp / len(val_loader)\n",
" avg_val_loss_lower = val_loss_lower / len(val_loader)\n",
" avg_val_loss_upper = val_loss_upper / len(val_loader)\n",
"\n",
" # Log epoch-level metrics to wandb\n",
" wandb.log({\n",
" \"epoch\": epoch + 1,\n",
" \"train/epoch_loss\": avg_train_loss,\n",
" \"val/epoch_loss\": avg_val_loss,\n",
" \"val/loss_exp\": avg_val_loss_exp,\n",
" \"val/loss_lower\": avg_val_loss_lower,\n",
" \"val/loss_upper\": avg_val_loss_upper,\n",
" })\n",
"\n",
" print(f\"\\nEpoch {epoch+1} Summary:\")\n",
" print(f\" Train Loss: {avg_train_loss:.4f}\")\n",
" print(f\" Val Loss: {avg_val_loss:.4f} (exp={avg_val_loss_exp:.4f}, lower={avg_val_loss_lower:.4f}, upper={avg_val_loss_upper:.4f})\")\n",
"\n",
" # ── SAVE BEST CHECKPOINT ───────────────────────────────\n",
" if avg_val_loss < best_val_loss:\n",
" best_val_loss = avg_val_loss\n",
" checkpoint_path = f\"{CHECKPOINT_DIR}/best_model.pt\"\n",
" torch.save({\n",
" \"epoch\": epoch + 1,\n",
" \"model_state_dict\": model.state_dict(),\n",
" \"optimizer_state_dict\": optimizer.state_dict(),\n",
" \"val_loss\": best_val_loss,\n",
" \"norm_stats\": norm_stats,\n",
" }, checkpoint_path)\n",
" print(f\" ✓ New best model saved (val_loss={best_val_loss:.4f})\")\n",
" wandb.save(checkpoint_path)\n",
"\n",
"wandb.finish()\n",
"print(\"\\nTraining complete!\")"
]
},
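{
"cell_type": "code",
"execution_count": null,
"id": "6fa08172-3d94-40be-85c3-708192031426",
"metadata": {},
"outputs": [],
"source": [
"# Optional: how a later session could resume from the best checkpoint saved\n",
"# above; unlike evaluation, resuming also restores the optimizer state. A\n",
"# minimal sketch (to resume exactly, the LR scheduler would also need to be\n",
"# re-created and fast-forwarded to the saved step).\n",
"ckpt = torch.load(f\"{CHECKPOINT_DIR}/best_model.pt\", map_location=DEVICE, weights_only=False)\n",
"model.load_state_dict(ckpt[\"model_state_dict\"])\n",
"optimizer.load_state_dict(ckpt[\"optimizer_state_dict\"])\n",
"print(f\"resumed from epoch {ckpt['epoch']} (val_loss={ckpt['val_loss']:.4f})\")"
]
},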
{
"cell_type": "code",
"execution_count": 39,
"id": "9d202c2b-6034-4d8f-94a3-19d147331e31",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded best model from epoch 3 (val_loss=1.3788)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Testing: 100%|██████████| 156/156 [00:25<00:00, 6.08it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"══════════════════════════════════════════\n",
" TEST SET RESULTS\n",
"══════════════════════════════════════════\n",
"\n",
" Experience Years:\n",
" MAE: 0.57 years\n",
" RMSE: 0.95 years\n",
" Within 1 year: 83.1%\n",
" Within 2 years: 95.4%\n",
"\n",
" Pay Lower:\n",
" MAE: $15,511\n",
" RMSE: $45,663\n",
" Within $10k: 65.0%\n",
" Within $20k: 84.5%\n",
"\n",
" Pay Upper:\n",
" MAE: $20,190\n",
" RMSE: $51,005\n",
" Within $10k: 54.4%\n",
" Within $20k: 76.0%\n",
"══════════════════════════════════════════\n"
]
},
{
"data": {
"text/html": [],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Tracking run with wandb version 0.25.0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Run data is saved locally in /workspace/wandb/run-20260305_022050-yrja68tg"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Syncing run deberta-v3-small-run1 to Weights & Biases (docs)
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View project at https://wandb.ai/zaidana-oregon-state-university/job-parser"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run at https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/yrja68tg"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
Run history:
| test/exp_mae_years | ▁ |
| test/exp_rmse_years | ▁ |
| test/exp_within_1yr | ▁ |
| test/exp_within_2yr | ▁ |
| test/lower_mae_usd | ▁ |
| test/lower_rmse_usd | ▁ |
| test/lower_within_10k | ▁ |
| test/lower_within_20k | ▁ |
| test/upper_mae_usd | ▁ |
| test/upper_rmse_usd | ▁ |
| +2 | ... |
Run summary:
| test/exp_mae_years | 0.56634 |
| test/exp_rmse_years | 0.94795 |
| test/exp_within_1yr | 83.12444 |
| test/exp_within_2yr | 95.38755 |
| test/lower_mae_usd | 15511.03421 |
| test/lower_rmse_usd | 45662.90012 |
| test/lower_within_10k | 65.0356 |
| test/lower_within_20k | 84.45804 |
| test/upper_mae_usd | 20189.76591 |
| test/upper_rmse_usd | 51005.47056 |
| +2 | ... |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run deberta-v3-small-run1 at: https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/yrja68tg
View project at: https://wandb.ai/zaidana-oregon-state-university/job-parser
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Find logs at: ./wandb/run-20260305_022050-yrja68tg/logs"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Sample Predictions vs Ground Truth (first 5):\n",
" # Exp(pred) Exp(true) Lower(pred) Lower(true) Upper(pred) Upper(true)\n",
" 1 4.9 5.0 $ 75,591 $ 75,000 $ 99,997 $ 95,000\n",
" 2 3.4 2.0 $ 44,687 $ 35,000 $ 59,228 $ 50,000\n",
" 3 5.2 6.0 $ 104,810 $ 100,000 $ 141,871 $ 130,000\n",
" 4 1.1 1.0 $ 29,540 $ 30,000 $ 29,446 $ 30,000\n",
" 5 -0.1 0.0 $ 27,787 $ 30,000 $ 30,070 $ 30,000\n"
]
}
],
"source": [
"import numpy as np\n",
"from tqdm import tqdm\n",
"\n",
"test_loader = DataLoader(dataset_tokenized[\"test\"], batch_size=64, shuffle=False, collate_fn=collate_fn, num_workers=4, pin_memory=True)\n",
"\n",
"# Load best checkpoint\n",
"checkpoint = torch.load(f\"{CHECKPOINT_DIR}/best_model.pt\", map_location=DEVICE, weights_only=False)\n",
"model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
"norm_stats = checkpoint[\"norm_stats\"]\n",
"print(f\"Loaded best model from epoch {checkpoint['epoch']} (val_loss={checkpoint['val_loss']:.4f})\")\n",
"\n",
"# ── RUN TEST SET ───────────────────────────────────────────\n",
"model.eval()\n",
"\n",
"all_pred_exp, all_label_exp = [], []\n",
"all_pred_lower, all_label_lower = [], []\n",
"all_pred_upper, all_label_upper = [], []\n",
"\n",
"with torch.no_grad():\n",
" for batch in tqdm(test_loader, desc=\"Testing\"):\n",
" input_ids = batch[\"input_ids\"].to(DEVICE)\n",
" attention_mask = batch[\"attention_mask\"].to(DEVICE)\n",
" token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n",
"\n",
" pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n",
"\n",
" all_pred_exp.append(pred_exp.cpu().numpy())\n",
" all_pred_lower.append(pred_lower.cpu().numpy())\n",
" all_pred_upper.append(pred_upper.cpu().numpy())\n",
"\n",
" all_label_exp.append(batch[\"expected_experience_years\"].numpy())\n",
" all_label_lower.append(batch[\"pay_lower\"].numpy())\n",
" all_label_upper.append(batch[\"pay_upper\"].numpy())\n",
"\n",
"# Flatten\n",
"pred_exp = np.concatenate(all_pred_exp)\n",
"pred_lower = np.concatenate(all_pred_lower)\n",
"pred_upper = np.concatenate(all_pred_upper)\n",
"\n",
"label_exp = np.concatenate(all_label_exp)\n",
"label_lower = np.concatenate(all_label_lower)\n",
"label_upper = np.concatenate(all_label_upper)\n",
"\n",
"# ── DENORMALIZE ────────────────────────────────────────────\n",
"def denorm(arr, col):\n",
" return arr * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n",
"\n",
"pred_exp_real = denorm(pred_exp, \"expected_experience_years\")\n",
"pred_lower_real = denorm(pred_lower, \"pay_lower\")\n",
"pred_upper_real = denorm(pred_upper, \"pay_upper\")\n",
"\n",
"label_exp_real = denorm(label_exp, \"expected_experience_years\")\n",
"label_lower_real = denorm(label_lower, \"pay_lower\")\n",
"label_upper_real = denorm(label_upper, \"pay_upper\")\n",
"\n",
"# ── METRICS ────────────────────────────────────────────────\n",
"def mae(pred, label): return np.mean(np.abs(pred - label))\n",
"def rmse(pred, label): return np.sqrt(np.mean((pred - label) ** 2))\n",
"def within_n(pred, label, n): return np.mean(np.abs(pred - label) <= n) * 100\n",
"\n",
"results = {\n",
" \"exp_mae_years\": mae(pred_exp_real, label_exp_real),\n",
" \"exp_rmse_years\": rmse(pred_exp_real, label_exp_real),\n",
" \"exp_within_1yr\": within_n(pred_exp_real, label_exp_real, 1),\n",
" \"exp_within_2yr\": within_n(pred_exp_real, label_exp_real, 2),\n",
" \"lower_mae_usd\": mae(pred_lower_real, label_lower_real),\n",
" \"lower_rmse_usd\": rmse(pred_lower_real, label_lower_real),\n",
" \"lower_within_10k\": within_n(pred_lower_real, label_lower_real, 10_000),\n",
" \"lower_within_20k\": within_n(pred_lower_real, label_lower_real, 20_000),\n",
" \"upper_mae_usd\": mae(pred_upper_real, label_upper_real),\n",
" \"upper_rmse_usd\": rmse(pred_upper_real, label_upper_real),\n",
" \"upper_within_10k\": within_n(pred_upper_real, label_upper_real, 10_000),\n",
" \"upper_within_20k\": within_n(pred_upper_real, label_upper_real, 20_000),\n",
"}\n",
"\n",
"# ── PRINT ──────────────────────────────────────────────────\n",
"print(\"\\n══════════════════════════════════════════\")\n",
"print(\" TEST SET RESULTS\")\n",
"print(\"══════════════════════════════════════════\")\n",
"print(f\"\\n Experience Years:\")\n",
"print(f\" MAE: {results['exp_mae_years']:.2f} years\")\n",
"print(f\" RMSE: {results['exp_rmse_years']:.2f} years\")\n",
"print(f\" Within 1 year: {results['exp_within_1yr']:.1f}%\")\n",
"print(f\" Within 2 years: {results['exp_within_2yr']:.1f}%\")\n",
"print(f\"\\n Pay Lower:\")\n",
"print(f\" MAE: ${results['lower_mae_usd']:,.0f}\")\n",
"print(f\" RMSE: ${results['lower_rmse_usd']:,.0f}\")\n",
"print(f\" Within $10k: {results['lower_within_10k']:.1f}%\")\n",
"print(f\" Within $20k: {results['lower_within_20k']:.1f}%\")\n",
"print(f\"\\n Pay Upper:\")\n",
"print(f\" MAE: ${results['upper_mae_usd']:,.0f}\")\n",
"print(f\" RMSE: ${results['upper_rmse_usd']:,.0f}\")\n",
"print(f\" Within $10k: {results['upper_within_10k']:.1f}%\")\n",
"print(f\" Within $20k: {results['upper_within_20k']:.1f}%\")\n",
"print(\"══════════════════════════════════════════\")\n",
"\n",
"# ── LOG TO WANDB ───────────────────────────────────────────\n",
"wandb.init(project=\"job-parser\", name=\"deberta-v3-small-run1\", resume=\"allow\")\n",
"wandb.log({\"test/\" + k: v for k, v in results.items()})\n",
"wandb.finish()\n",
"\n",
"# ── SAMPLE PREDICTIONS ─────────────────────────────────────\n",
"print(\"\\n Sample Predictions vs Ground Truth (first 5):\")\n",
"print(f\" {'#':>3} {'Exp(pred)':>10} {'Exp(true)':>10} {'Lower(pred)':>12} {'Lower(true)':>12} {'Upper(pred)':>12} {'Upper(true)':>12}\")\n",
"for i in range(5):\n",
" print(f\" {i+1:>3} {pred_exp_real[i]:>10.1f} {label_exp_real[i]:>10.1f} \"\n",
" f\"${pred_lower_real[i]:>11,.0f} ${label_lower_real[i]:>11,.0f} \"\n",
" f\"${pred_upper_real[i]:>11,.0f} ${label_upper_real[i]:>11,.0f}\")"
]
},
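{
"cell_type": "code",
"execution_count": null,
"id": "70b19283-4ea5-41cf-96d4-819203142537",
"metadata": {},
"outputs": [],
"source": [
"# Optional diagnostic: the three regression heads are independent, so nothing\n",
"# forces pred_lower <= pred_upper. Measure how often the predicted salary\n",
"# range is internally consistent. A minimal sketch over the denormalized test\n",
"# predictions computed above.\n",
"valid_pct = np.mean(pred_lower_real <= pred_upper_real) * 100\n",
"print(f\"pred_lower <= pred_upper on {valid_pct:.1f}% of test examples\")\n",
"print(f\"mean predicted range width: ${np.mean(pred_upper_real - pred_lower_real):,.0f}\")"
]
},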
{
"cell_type": "code",
"execution_count": 46,
"id": "3521a3c5-ba1f-42e8-a4de-76092b18e025",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"══════════════════════════════════════════\n",
" JOB CLASSIFICATION\n",
"══════════════════════════════════════════\n",
" Title: Manager Software Engineer\n",
" Expected Experience: 6 years\n",
" Salary Range: $117,345 – $154,462\n",
"══════════════════════════════════════════\n"
]
}
],
"source": [
"# ── INFERENCE ──────────────────────────────────────────────\n",
"JOB_TITLE = \"Manager Software Engineer\"\n",
"JOB_DESCRIPTION = \"\"\"\n",
"We are looking for a lead Software Engineer to join our team.\n",
"You will be responsible for designing and implementing scalable backend services.\n",
"Requirements: Strong experience with Python, distributed systems, and cloud infrastructure.\n",
"\"\"\"\n",
"\n",
"# ── RUN THROUGH MODEL ──────────────────────────────────────\n",
"model.eval()\n",
"\n",
"text = f\"[TITLE]: {JOB_TITLE} [DESC]: {JOB_DESCRIPTION.strip()}\"\n",
"\n",
"encoded = tokenizer(\n",
" text,\n",
" padding=\"max_length\",\n",
" truncation=True,\n",
" max_length=MAX_LENGTH,\n",
" return_tensors=\"pt\"\n",
")\n",
"\n",
"input_ids = encoded[\"input_ids\"].to(DEVICE)\n",
"attention_mask = encoded[\"attention_mask\"].to(DEVICE)\n",
"token_type_ids = encoded[\"token_type_ids\"].to(DEVICE)\n",
"\n",
"with torch.no_grad():\n",
" pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n",
"\n",
"# ── DENORMALIZE ────────────────────────────────────────────\n",
"exp_years = pred_exp.item() * norm_stats[\"expected_experience_years\"][\"std\"] + norm_stats[\"expected_experience_years\"][\"mean\"]\n",
"lower_salary = pred_lower.item() * norm_stats[\"pay_lower\"][\"std\"] + norm_stats[\"pay_lower\"][\"mean\"]\n",
"upper_salary = pred_upper.item() * norm_stats[\"pay_upper\"][\"std\"] + norm_stats[\"pay_upper\"][\"mean\"]\n",
"\n",
"# ── PRINT RESULT ───────────────────────────────────────────\n",
"print(\"══════════════════════════════════════════\")\n",
"print(\" JOB CLASSIFICATION\")\n",
"print(\"══════════════════════════════════════════\")\n",
"print(f\" Title: {JOB_TITLE}\")\n",
"print(f\" Expected Experience: {max(0, round(exp_years))} years\")\n",
"print(f\" Salary Range: ${max(0, lower_salary):,.0f} – ${max(0, upper_salary):,.0f}\")\n",
"print(\"══════════════════════════════════════════\")"
]
},
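{
"cell_type": "code",
"execution_count": null,
"id": "81c2a394-5fb6-42d0-a7e5-920314253648",
"metadata": {},
"outputs": [],
"source": [
"# Optional: the inference steps above wrapped into a reusable helper so other\n",
"# postings can be scored in one call. A minimal sketch reusing the tokenizer,\n",
"# model, and norm_stats already in scope.\n",
"def predict_job(title, description):\n",
"    text = f\"[TITLE]: {title} [DESC]: {description.strip()}\"\n",
"    enc = tokenizer(text, padding=\"max_length\", truncation=True, max_length=MAX_LENGTH, return_tensors=\"pt\")\n",
"    enc = {k: v.to(DEVICE) for k, v in enc.items()}\n",
"    model.eval()\n",
"    with torch.no_grad():\n",
"        exp, lower, upper = model(enc[\"input_ids\"], enc[\"attention_mask\"], enc[\"token_type_ids\"])\n",
"    denorm = lambda p, col: p.item() * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n",
"    return {\n",
"        \"experience_years\": max(0, round(denorm(exp, \"expected_experience_years\"))),\n",
"        \"pay_lower\": max(0.0, denorm(lower, \"pay_lower\")),\n",
"        \"pay_upper\": max(0.0, denorm(upper, \"pay_upper\")),\n",
"    }\n",
"\n",
"print(predict_job(\"Junior Data Analyst\", \"Entry-level role building SQL reports and dashboards.\"))"
]
},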
{
"cell_type": "code",
"execution_count": 47,
"id": "b7e2d1f9-4731-4bc5-837b-ced802b3080c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repo ready: https://huggingface.co/akzaidan/JobPredictor1\n",
"All files saved locally:\n",
" README.md\n",
" config.json\n",
" tokenizer.json\n",
" tokenizer_config.json\n",
" norm_stats.json\n",
" pytorch_model.bin\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8e4dc024014b4fb495c8efe174b7f528",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing Files (0 / 0): | | 0.00B / 0.00B "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "efed4d0283334e059e403eaffcc1e9de",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"New Data Upload: | | 0.00B / 0.00B "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"✓ Model uploaded to: https://huggingface.co/akzaidan/JobPredictor1\n"
]
}
],
"source": [
"import json\n",
"from huggingface_hub import HfApi, create_repo\n",
"\n",
"HF_USERNAME = \"akzaidan\"\n",
"REPO_NAME = \"JobPredictor1\"\n",
"REPO_ID = f\"{HF_USERNAME}/{REPO_NAME}\"\n",
"\n",
"# ── CREATE REPO ────────────────────────────────────────────\n",
"api = create_repo(REPO_ID, repo_type=\"model\", exist_ok=True)\n",
"print(f\"Repo ready: https://huggingface.co/{REPO_ID}\")\n",
"\n",
"# ── SAVE MODEL FILES LOCALLY ───────────────────────────────\n",
"UPLOAD_DIR = \"/workspace/JobPredictor1\"\n",
"os.makedirs(UPLOAD_DIR, exist_ok=True)\n",
"\n",
"# 1. Save model weights\n",
"torch.save(model.state_dict(), f\"{UPLOAD_DIR}/pytorch_model.bin\")\n",
"\n",
"# 2. Save norm_stats (needed for inference)\n",
"with open(f\"{UPLOAD_DIR}/norm_stats.json\", \"w\") as f:\n",
" json.dump(norm_stats, f, indent=2)\n",
"\n",
"# 3. Save tokenizer\n",
"tokenizer.save_pretrained(UPLOAD_DIR)\n",
"\n",
"# 4. Save model config so anyone knows what base model was used\n",
"config_data = {\n",
" \"base_model\": \"microsoft/deberta-v3-small\",\n",
" \"architecture\": \"DeBERTa-v3-small + 3 regression heads\",\n",
" \"outputs\": {\n",
" \"expected_experience_years\": \"integer (years of experience)\",\n",
" \"pay_lower\": \"integer (lower salary bound USD)\",\n",
" \"pay_upper\": \"integer (upper salary bound USD)\"\n",
" },\n",
" \"normalization\": \"z-score — use norm_stats.json to denormalize predictions\",\n",
" \"max_length\": 512,\n",
" \"dropout\": 0.1,\n",
"}\n",
"with open(f\"{UPLOAD_DIR}/config.json\", \"w\") as f:\n",
" json.dump(config_data, f, indent=2)\n",
"\n",
"# 5. Save a model card (README)\n",
"model_card = \"\"\"---\n",
"language: en\n",
"tags:\n",
" - job-classification\n",
" - salary-prediction\n",
" - experience-prediction\n",
" - deberta\n",
"---\n",
"\n",
"# JobPredictor1\n",
"\n",
"Fine-tuned DeBERTa-v3-small model that predicts:\n",
"- **Expected years of experience** required for a job\n",
"- **Lower salary bound** (USD)\n",
"- **Upper salary bound** (USD)\n",
"\n",
"## Input Format\n",
"```\n",
"[TITLE]: [DESC]: \n",
"```\n",
"\n",
"## Outputs\n",
"| Output | Type | Description |\n",
"|---|---|---|\n",
"| expected_experience_years | int | Years of experience required |\n",
"| pay_lower | int | Lower salary bound (USD) |\n",
"| pay_upper | int | Upper salary bound (USD) |\n",
"\n",
"## Normalization\n",
"Predictions are z-score normalized. Use `norm_stats.json` to denormalize:\n",
"```python\n",
"real_value = pred * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n",
"```\n",
"\n",
"## Test Set Performance\n",
"| Metric | Value |\n",
"|---|---|\n",
"| Experience MAE | 0.57 years |\n",
"| Experience Within 1yr | 83.1% |\n",
"| Pay Lower MAE | $15,511 |\n",
"| Pay Lower Within $20k | 84.5% |\n",
"| Pay Upper MAE | $20,190 |\n",
"| Pay Upper Within $20k | 76.0% |\n",
"\n",
"## Base Model\n",
"microsoft/deberta-v3-small\n",
"\"\"\"\n",
"with open(f\"{UPLOAD_DIR}/README.md\", \"w\") as f:\n",
" f.write(model_card)\n",
"\n",
"print(\"All files saved locally:\")\n",
"for fname in os.listdir(UPLOAD_DIR):\n",
" print(f\" {fname}\")\n",
"\n",
"# ── UPLOAD TO HUGGINGFACE ──────────────────────────────────\n",
"api = HfApi()\n",
"api.upload_folder(\n",
" folder_path=UPLOAD_DIR,\n",
" repo_id=REPO_ID,\n",
" repo_type=\"model\"\n",
")\n",
"\n",
"print(f\"\\n✓ Model uploaded to: https://huggingface.co/{REPO_ID}\")"
]
},
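{
"cell_type": "code",
"execution_count": null,
"id": "92d3b4a5-60c7-43e1-b8f6-031425364759",
"metadata": {},
"outputs": [],
"source": [
"# Optional: how a downstream user could pull the uploaded files back down and\n",
"# rebuild the model. A minimal sketch; it assumes the repo layout written\n",
"# above (pytorch_model.bin + norm_stats.json) and the JobRegressorModel class.\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"weights_path = hf_hub_download(repo_id=REPO_ID, filename=\"pytorch_model.bin\")\n",
"stats_path = hf_hub_download(repo_id=REPO_ID, filename=\"norm_stats.json\")\n",
"\n",
"reloaded = JobRegressorModel(\"microsoft/deberta-v3-small\")\n",
"reloaded.load_state_dict(torch.load(weights_path, map_location=\"cpu\"))\n",
"with open(stats_path) as f:\n",
"    reloaded_stats = json.load(f)\n",
"print(\"reloaded model weights and stats:\", list(reloaded_stats.keys()))"
]
},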
{
"cell_type": "code",
"execution_count": null,
"id": "78c735e3-76aa-4e5b-8e38-02738f8a6e4e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}