{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "cea0d83b-7dfe-4bfb-b9ca-6d3114dd3a7a", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# Read the token from the environment or paste it here locally only — never commit a real token.\n", "os.environ[\"HF_TOKEN\"] = \"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "257f2b76-59ca-498e-97ac-fdb543d7a47f", "metadata": {}, "outputs": [], "source": [ "%pip install datasets huggingface_hub transformers sentencepiece tiktoken protobuf pandas pyarrow wandb" ] }, { "cell_type": "code", "execution_count": 3, "id": "376e8042-91ac-4690-a956-5d598e9677c4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloaded to:\n", " train: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/train.parquet\n", " val: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/val.parquet\n", " test: /workspace/.cache/huggingface/hub/datasets--akzaidan--Job_Data_Parser/snapshots/30fb9953206f668375dd28297f330bcb909e12cd/test.parquet\n" ] } ], "source": [ "from huggingface_hub import hf_hub_download\n", "import pandas as pd\n", "\n", "REPO_ID = \"akzaidan/Job_Data_Parser\"\n", "\n", "# Download only the 3 parquet files (no full dataset clone needed)\n", "train_path = hf_hub_download(repo_id=REPO_ID, filename=\"train.parquet\", repo_type=\"dataset\")\n", "val_path = hf_hub_download(repo_id=REPO_ID, filename=\"val.parquet\", repo_type=\"dataset\")\n", "test_path = hf_hub_download(repo_id=REPO_ID, filename=\"test.parquet\", repo_type=\"dataset\")\n", "\n", "print(\"Downloaded to:\")\n", "print(\" train:\", train_path)\n", "print(\" val: \", val_path)\n", "print(\" test: \", test_path)" ] }, { "cell_type": "code", "execution_count": 4, "id": "fccc2393-3220-4e4d-ab03-41702bc00c96", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 249,379 rows\n", "Val: 14,964 rows\n", "Test: 9,973 rows\n" ] } ],
"source": [ "train_df = pd.read_parquet(train_path)\n", "val_df = pd.read_parquet(val_path)\n", "test_df = pd.read_parquet(test_path)\n", "\n", "print(f\"Train: {len(train_df):,} rows\")\n", "print(f\"Val: {len(val_df):,} rows\")\n", "print(f\"Test: {len(test_df):,} rows\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "e04be49b-1b94-45aa-bb5d-047edb74a7f4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['text', 'pay_lower', 'pay_upper', 'expected_experience_years']\n", "text str\n", "pay_lower int64\n", "pay_upper int64\n", "expected_experience_years int64\n", "dtype: object\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textpay_lowerpay_upperexpected_experience_years
0[TITLE]: Facilities Officer [DESC]: **37 hours...30000300000
1[TITLE]: Floor refinishing technician [DESC]: ...33280374400
2[TITLE]: Registrar in Haematology - Immediate ...800001100003
\n", "
" ], "text/plain": [ " text pay_lower pay_upper \\\n", "0 [TITLE]: Facilities Officer [DESC]: **37 hours... 30000 30000 \n", "1 [TITLE]: Floor refinishing technician [DESC]: ... 33280 37440 \n", "2 [TITLE]: Registrar in Haematology - Immediate ... 80000 110000 \n", "\n", " expected_experience_years \n", "0 0 \n", "1 0 \n", "2 3 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(train_df.columns.tolist())\n", "print(train_df.dtypes)\n", "train_df.head(3)" ] }, { "cell_type": "code", "execution_count": 6, "id": "9b7cc3d4-cff8-480f-b124-e35c88248891", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train after dropping missing: 249,379\n", "Val after dropping missing: 14,964\n", "Test after dropping missing: 9,973\n", "\n", "norm_stats (SAVE THESE):\n", " expected_experience_years: mean=2.8786, std=2.7134\n", " pay_lower: mean=70610.1505, std=54165.9840\n", " pay_upper: mean=92701.2608, std=69885.4088\n", "\n", "Normalized target distributions:\n", " expected_experience_years pay_lower pay_upper\n", "count 249379.000 249379.000 249379.000\n", "mean -0.000 0.000 0.000\n", "std 1.000 1.000 1.000\n", "min -1.061 -0.750 -0.897\n", "25% -0.692 -0.651 -0.683\n", "50% -0.324 -0.196 -0.225\n", "75% 0.782 0.266 0.319\n", "max 6.310 7.927 5.828\n" ] } ], "source": [ "import numpy as np\n", "from datasets import Dataset, DatasetDict\n", "\n", "# Drop rows where ANY label is -1 (missing)\n", "def drop_missing(df):\n", " return df[\n", " (df[\"expected_experience_years\"] != -1) &\n", " (df[\"pay_lower\"] != -1) &\n", " (df[\"pay_upper\"] != -1)\n", " ].reset_index(drop=True)\n", "\n", "train_clean = drop_missing(train_df)\n", "val_clean = drop_missing(val_df)\n", "test_clean = drop_missing(test_df)\n", "\n", "print(f\"Train after dropping missing: {len(train_clean):,}\")\n", "print(f\"Val after dropping missing: {len(val_clean):,}\")\n", "print(f\"Test after dropping missing: 
{len(test_clean):,}\")\n", "\n", "# Z-score normalize all 3 targets using ONLY train stats\n", "TARGET_COLS = [\"expected_experience_years\", \"pay_lower\", \"pay_upper\"]\n", "\n", "norm_stats = {}\n", "for col in TARGET_COLS:\n", " mean = train_clean[col].mean()\n", " std = train_clean[col].std()\n", " norm_stats[col] = {\"mean\": mean, \"std\": std}\n", "\n", " train_clean[col] = (train_clean[col] - mean) / std\n", " val_clean[col] = (val_clean[col] - mean) / std\n", " test_clean[col] = (test_clean[col] - mean) / std\n", "\n", "# Print stats so you can save them for inference later\n", "print(\"\\nnorm_stats (SAVE THESE):\")\n", "for col, stats in norm_stats.items():\n", " print(f\" {col}: mean={stats['mean']:.4f}, std={stats['std']:.4f}\")\n", "\n", "# Sanity check — all 3 columns should now be centered near 0\n", "print(\"\\nNormalized target distributions:\")\n", "print(train_clean[TARGET_COLS].describe().round(3))\n", "\n", "# Convert to HuggingFace DatasetDict\n", "dataset = DatasetDict({\n", " \"train\": Dataset.from_pandas(train_clean),\n", " \"val\": Dataset.from_pandas(val_clean),\n", " \"test\": Dataset.from_pandas(test_clean),\n", "})" ] }, { "cell_type": "code", "execution_count": 7, "id": "504d2542-3efa-4498-afec-3cd0d9184395", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using device: cuda\n", "GPU: NVIDIA GeForce RTX 5090\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "from transformers import AutoTokenizer, AutoModel\n", "\n", "MODEL_NAME = \"microsoft/deberta-v3-small\"\n", "MAX_LENGTH = 512 # DeBERTa-v3-small's max token length\n", "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "print(f\"Using device: {DEVICE}\")\n", "print(f\"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "facf872f-9e95-49e8-9a1d-48b875eb4b6a", "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Torch version: 2.8.0+cu128\n", "CUDA available: True\n", "CUDA version torch was built with: 12.8\n", "Tensor on GPU: tensor([1.], device='cuda:0')\n" ] } ], "source": [ "import torch\n", "\n", "print(f\"Torch version: {torch.__version__}\")\n", "print(f\"CUDA available: {torch.cuda.is_available()}\")\n", "print(f\"CUDA version torch was built with: {torch.version.cuda}\")\n", "\n", "if torch.cuda.is_available():\n", " x = torch.tensor([1.0]).cuda()\n", " print(f\"Tensor on GPU: {x}\")\n", "else:\n", " print(\"CUDA not available — check torch build\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "cf05f57f-affb-4215-95e8-8a095b2c9d69", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dc832c3c14c04c9488e1f3691e3d996a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Tokenizing: 0%| | 0/249379 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

Run history:


epoch▁▃▆█
step▁▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/epoch_loss█▄▂▁
train/loss ▆▂▂▆▆▁▃▂▅▂▁▁▁▂▁▂▁▁▁▂▁▁▁▁█▆▁█▂▂▁▁▁▅▁▆▁▁▁
train/loss_exp █▆▄▆▂▂▂▂▁▁▁▂▁▂▁▁▂▂▁▁▁▂▁▁▁▂▁▁▂▁▁▄▂▁▁▁▁▁
train/loss_lower ▃ ▃▅▂▁▂█▁▁▁▇▁▁▁▁▂▆▁▁▁█▁▃▃▁▁▁▁▁▁▁▁▁▆▁▁▁
train/loss_upper▃▂▇▃▂▅▂▁▂▁▂▂▁▁▇▁▂▁▆▁▂▂▁▂▂█▂▁▁▁▁▁▅█▁▃▁▁▁▁
train/lr▅▂▂▁██▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
val/epoch_loss█▄▁▂
val/loss_exp█▂▂▁
+2...

Run summary:


epoch4
step31082
train/epoch_loss0.7895
train/loss0.69179
train/loss_exp0.04378
train/loss_lower0.05979
train/loss_upper0.58822
train/lr0.0
val/epoch_loss1.39849
val/loss_exp0.1174
+2...

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run deberta-v3-small-run1 at: https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/zmn5c10d
View project at: https://wandb.ai/zaidana-oregon-state-university/job-parser
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Find logs at: ./wandb/run-20260304_235907-zmn5c10d/logs" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Training complete!\n" ] } ], "source": [ "import os\n", "from tqdm import tqdm\n", "\n", "CHECKPOINT_DIR = \"/workspace/checkpoints\"\n", "os.makedirs(CHECKPOINT_DIR, exist_ok=True)\n", "\n", "best_val_loss = float(\"inf\")\n", "\n", "for epoch in range(EPOCHS):\n", " # ── TRAIN ──────────────────────────────────────────────\n", " model.train()\n", " train_loss = 0\n", " train_loss_exp = train_loss_lower = train_loss_upper = 0\n", "\n", " pbar = tqdm(train_loader, desc=f\"Epoch {epoch+1}/{EPOCHS} [Train]\")\n", " for step, batch in enumerate(pbar):\n", " input_ids = batch[\"input_ids\"].to(DEVICE)\n", " attention_mask = batch[\"attention_mask\"].to(DEVICE)\n", " token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n", " label_exp = batch[\"expected_experience_years\"].to(DEVICE)\n", " label_lower = batch[\"pay_lower\"].to(DEVICE)\n", " label_upper = batch[\"pay_upper\"].to(DEVICE)\n", "\n", " optimizer.zero_grad()\n", "\n", " pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n", " loss_exp = loss_fn(pred_exp, label_exp)\n", " loss_lower = loss_fn(pred_lower, label_lower)\n", " loss_upper = loss_fn(pred_upper, label_upper)\n", " loss = loss_exp + loss_lower + loss_upper\n", "\n", " loss.backward()\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n", " optimizer.step()\n", " scheduler.step()\n", "\n", " train_loss += loss.item()\n", " train_loss_exp += loss_exp.item()\n", " train_loss_lower += loss_lower.item()\n", " train_loss_upper += loss_upper.item()\n", "\n", " # Log to wandb every 100 steps\n", " if step % 100 == 0:\n", " 
wandb.log({\n", " \"train/loss\": loss.item(),\n", " \"train/loss_exp\": loss_exp.item(),\n", " \"train/loss_lower\": loss_lower.item(),\n", " \"train/loss_upper\": loss_upper.item(),\n", " \"train/lr\": scheduler.get_last_lr()[0],\n", " \"step\": epoch * len(train_loader) + step,\n", " })\n", "\n", " pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})\n", "\n", " avg_train_loss = train_loss / len(train_loader)\n", "\n", " # ── VALIDATE ───────────────────────────────────────────\n", " model.eval()\n", " val_loss = 0\n", " val_loss_exp = val_loss_lower = val_loss_upper = 0\n", "\n", " with torch.no_grad():\n", " for batch in tqdm(val_loader, desc=f\"Epoch {epoch+1}/{EPOCHS} [Val]\"):\n", " input_ids = batch[\"input_ids\"].to(DEVICE)\n", " attention_mask = batch[\"attention_mask\"].to(DEVICE)\n", " token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n", " label_exp = batch[\"expected_experience_years\"].to(DEVICE)\n", " label_lower = batch[\"pay_lower\"].to(DEVICE)\n", " label_upper = batch[\"pay_upper\"].to(DEVICE)\n", "\n", " pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n", " loss_exp = loss_fn(pred_exp, label_exp)\n", " loss_lower = loss_fn(pred_lower, label_lower)\n", " loss_upper = loss_fn(pred_upper, label_upper)\n", " loss = loss_exp + loss_lower + loss_upper\n", "\n", " val_loss += loss.item()\n", " val_loss_exp += loss_exp.item()\n", " val_loss_lower += loss_lower.item()\n", " val_loss_upper += loss_upper.item()\n", "\n", " avg_val_loss = val_loss / len(val_loader)\n", " avg_val_loss_exp = val_loss_exp / len(val_loader)\n", " avg_val_loss_lower = val_loss_lower / len(val_loader)\n", " avg_val_loss_upper = val_loss_upper / len(val_loader)\n", "\n", " # Log epoch-level metrics to wandb\n", " wandb.log({\n", " \"epoch\": epoch + 1,\n", " \"train/epoch_loss\": avg_train_loss,\n", " \"val/epoch_loss\": avg_val_loss,\n", " \"val/loss_exp\": avg_val_loss_exp,\n", " \"val/loss_lower\": avg_val_loss_lower,\n", " 
\"val/loss_upper\": avg_val_loss_upper,\n", " })\n", "\n", " print(f\"\\nEpoch {epoch+1} Summary:\")\n", " print(f\" Train Loss: {avg_train_loss:.4f}\")\n", " print(f\" Val Loss: {avg_val_loss:.4f} (exp={avg_val_loss_exp:.4f}, lower={avg_val_loss_lower:.4f}, upper={avg_val_loss_upper:.4f})\")\n", "\n", " # ── SAVE BEST CHECKPOINT ───────────────────────────────\n", " if avg_val_loss < best_val_loss:\n", " best_val_loss = avg_val_loss\n", " checkpoint_path = f\"{CHECKPOINT_DIR}/best_model.pt\"\n", " torch.save({\n", " \"epoch\": epoch + 1,\n", " \"model_state_dict\": model.state_dict(),\n", " \"optimizer_state_dict\": optimizer.state_dict(),\n", " \"val_loss\": best_val_loss,\n", " \"norm_stats\": norm_stats,\n", " }, checkpoint_path)\n", " print(f\" ✓ New best model saved (val_loss={best_val_loss:.4f})\")\n", " wandb.save(checkpoint_path)\n", "\n", "wandb.finish()\n", "print(\"\\nTraining complete!\")" ] }, { "cell_type": "code", "execution_count": 39, "id": "9d202c2b-6034-4d8f-94a3-19d147331e31", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded best model from epoch 3 (val_loss=1.3788)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Testing: 100%|██████████| 156/156 [00:25<00:00, 6.08it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "══════════════════════════════════════════\n", " TEST SET RESULTS\n", "══════════════════════════════════════════\n", "\n", " Experience Years:\n", " MAE: 0.57 years\n", " RMSE: 0.95 years\n", " Within 1 year: 83.1%\n", " Within 2 years: 95.4%\n", "\n", " Pay Lower:\n", " MAE: $15,511\n", " RMSE: $45,663\n", " Within $10k: 65.0%\n", " Within $20k: 84.5%\n", "\n", " Pay Upper:\n", " MAE: $20,190\n", " RMSE: $51,005\n", " Within $10k: 54.4%\n", " Within $20k: 76.0%\n", "══════════════════════════════════════════\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ 
"Tracking run with wandb version 0.25.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /workspace/wandb/run-20260305_022050-yrja68tg" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run deberta-v3-small-run1 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/zaidana-oregon-state-university/job-parser" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/yrja68tg" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

Run history:


test/exp_mae_years
test/exp_rmse_years
test/exp_within_1yr
test/exp_within_2yr
test/lower_mae_usd
test/lower_rmse_usd
test/lower_within_10k
test/lower_within_20k
test/upper_mae_usd
test/upper_rmse_usd
+2...

Run summary:


test/exp_mae_years0.56634
test/exp_rmse_years0.94795
test/exp_within_1yr83.12444
test/exp_within_2yr95.38755
test/lower_mae_usd15511.03421
test/lower_rmse_usd45662.90012
test/lower_within_10k65.0356
test/lower_within_20k84.45804
test/upper_mae_usd20189.76591
test/upper_rmse_usd51005.47056
+2...

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run deberta-v3-small-run1 at: https://wandb.ai/zaidana-oregon-state-university/job-parser/runs/yrja68tg
View project at: https://wandb.ai/zaidana-oregon-state-university/job-parser
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Find logs at: ./wandb/run-20260305_022050-yrja68tg/logs" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " Sample Predictions vs Ground Truth (first 5):\n", " # Exp(pred) Exp(true) Lower(pred) Lower(true) Upper(pred) Upper(true)\n", " 1 4.9 5.0 $ 75,591 $ 75,000 $ 99,997 $ 95,000\n", " 2 3.4 2.0 $ 44,687 $ 35,000 $ 59,228 $ 50,000\n", " 3 5.2 6.0 $ 104,810 $ 100,000 $ 141,871 $ 130,000\n", " 4 1.1 1.0 $ 29,540 $ 30,000 $ 29,446 $ 30,000\n", " 5 -0.1 0.0 $ 27,787 $ 30,000 $ 30,070 $ 30,000\n" ] } ], "source": [ "import numpy as np\n", "from tqdm import tqdm\n", "\n", "test_loader = DataLoader(dataset_tokenized[\"test\"], batch_size=64, shuffle=False, collate_fn=collate_fn, num_workers=4, pin_memory=True)\n", "\n", "# Load best checkpoint\n", "checkpoint = torch.load(f\"{CHECKPOINT_DIR}/best_model.pt\", map_location=DEVICE, weights_only=False)\n", "model.load_state_dict(checkpoint[\"model_state_dict\"])\n", "norm_stats = checkpoint[\"norm_stats\"]\n", "print(f\"Loaded best model from epoch {checkpoint['epoch']} (val_loss={checkpoint['val_loss']:.4f})\")\n", "\n", "# ── RUN TEST SET ───────────────────────────────────────────\n", "model.eval()\n", "\n", "all_pred_exp, all_label_exp = [], []\n", "all_pred_lower, all_label_lower = [], []\n", "all_pred_upper, all_label_upper = [], []\n", "\n", "with torch.no_grad():\n", " for batch in tqdm(test_loader, desc=\"Testing\"):\n", " input_ids = batch[\"input_ids\"].to(DEVICE)\n", " attention_mask = batch[\"attention_mask\"].to(DEVICE)\n", " token_type_ids = batch[\"token_type_ids\"].to(DEVICE)\n", "\n", " pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n", "\n", " all_pred_exp.append(pred_exp.cpu().numpy())\n", " 
all_pred_lower.append(pred_lower.cpu().numpy())\n", " all_pred_upper.append(pred_upper.cpu().numpy())\n", "\n", " all_label_exp.append(batch[\"expected_experience_years\"].numpy())\n", " all_label_lower.append(batch[\"pay_lower\"].numpy())\n", " all_label_upper.append(batch[\"pay_upper\"].numpy())\n", "\n", "# Flatten\n", "pred_exp = np.concatenate(all_pred_exp)\n", "pred_lower = np.concatenate(all_pred_lower)\n", "pred_upper = np.concatenate(all_pred_upper)\n", "\n", "label_exp = np.concatenate(all_label_exp)\n", "label_lower = np.concatenate(all_label_lower)\n", "label_upper = np.concatenate(all_label_upper)\n", "\n", "# ── DENORMALIZE ────────────────────────────────────────────\n", "def denorm(arr, col):\n", " return arr * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n", "\n", "pred_exp_real = denorm(pred_exp, \"expected_experience_years\")\n", "pred_lower_real = denorm(pred_lower, \"pay_lower\")\n", "pred_upper_real = denorm(pred_upper, \"pay_upper\")\n", "\n", "label_exp_real = denorm(label_exp, \"expected_experience_years\")\n", "label_lower_real = denorm(label_lower, \"pay_lower\")\n", "label_upper_real = denorm(label_upper, \"pay_upper\")\n", "\n", "# ── METRICS ────────────────────────────────────────────────\n", "def mae(pred, label): return np.mean(np.abs(pred - label))\n", "def rmse(pred, label): return np.sqrt(np.mean((pred - label) ** 2))\n", "def within_n(pred, label, n): return np.mean(np.abs(pred - label) <= n) * 100\n", "\n", "results = {\n", " \"exp_mae_years\": mae(pred_exp_real, label_exp_real),\n", " \"exp_rmse_years\": rmse(pred_exp_real, label_exp_real),\n", " \"exp_within_1yr\": within_n(pred_exp_real, label_exp_real, 1),\n", " \"exp_within_2yr\": within_n(pred_exp_real, label_exp_real, 2),\n", " \"lower_mae_usd\": mae(pred_lower_real, label_lower_real),\n", " \"lower_rmse_usd\": rmse(pred_lower_real, label_lower_real),\n", " \"lower_within_10k\": within_n(pred_lower_real, label_lower_real, 10_000),\n", " \"lower_within_20k\": 
within_n(pred_lower_real, label_lower_real, 20_000),\n", " \"upper_mae_usd\": mae(pred_upper_real, label_upper_real),\n", " \"upper_rmse_usd\": rmse(pred_upper_real, label_upper_real),\n", " \"upper_within_10k\": within_n(pred_upper_real, label_upper_real, 10_000),\n", " \"upper_within_20k\": within_n(pred_upper_real, label_upper_real, 20_000),\n", "}\n", "\n", "# ── PRINT ──────────────────────────────────────────────────\n", "print(\"\\n══════════════════════════════════════════\")\n", "print(\" TEST SET RESULTS\")\n", "print(\"══════════════════════════════════════════\")\n", "print(f\"\\n Experience Years:\")\n", "print(f\" MAE: {results['exp_mae_years']:.2f} years\")\n", "print(f\" RMSE: {results['exp_rmse_years']:.2f} years\")\n", "print(f\" Within 1 year: {results['exp_within_1yr']:.1f}%\")\n", "print(f\" Within 2 years: {results['exp_within_2yr']:.1f}%\")\n", "print(f\"\\n Pay Lower:\")\n", "print(f\" MAE: ${results['lower_mae_usd']:,.0f}\")\n", "print(f\" RMSE: ${results['lower_rmse_usd']:,.0f}\")\n", "print(f\" Within $10k: {results['lower_within_10k']:.1f}%\")\n", "print(f\" Within $20k: {results['lower_within_20k']:.1f}%\")\n", "print(f\"\\n Pay Upper:\")\n", "print(f\" MAE: ${results['upper_mae_usd']:,.0f}\")\n", "print(f\" RMSE: ${results['upper_rmse_usd']:,.0f}\")\n", "print(f\" Within $10k: {results['upper_within_10k']:.1f}%\")\n", "print(f\" Within $20k: {results['upper_within_20k']:.1f}%\")\n", "print(\"══════════════════════════════════════════\")\n", "\n", "# ── LOG TO WANDB ───────────────────────────────────────────\n", "wandb.init(project=\"job-parser\", name=\"deberta-v3-small-run1\", resume=\"allow\")\n", "wandb.log({\"test/\" + k: v for k, v in results.items()})\n", "wandb.finish()\n", "\n", "# ── SAMPLE PREDICTIONS ─────────────────────────────────────\n", "print(\"\\n Sample Predictions vs Ground Truth (first 5):\")\n", "print(f\" {'#':>3} {'Exp(pred)':>10} {'Exp(true)':>10} {'Lower(pred)':>12} {'Lower(true)':>12} {'Upper(pred)':>12} 
{'Upper(true)':>12}\")\n", "for i in range(5):\n", " print(f\" {i+1:>3} {pred_exp_real[i]:>10.1f} {label_exp_real[i]:>10.1f} \"\n", " f\"${pred_lower_real[i]:>11,.0f} ${label_lower_real[i]:>11,.0f} \"\n", " f\"${pred_upper_real[i]:>11,.0f} ${label_upper_real[i]:>11,.0f}\")" ] }, { "cell_type": "code", "execution_count": 46, "id": "3521a3c5-ba1f-42e8-a4de-76092b18e025", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "══════════════════════════════════════════\n", " JOB CLASSIFICATION\n", "══════════════════════════════════════════\n", " Title: Manager Software Engineer\n", " Expected Experience: 6 years\n", " Salary Range: $117,345 – $154,462\n", "══════════════════════════════════════════\n" ] } ], "source": [ "# ── INFERENCE ──────────────────────────────────────────────\n", "JOB_TITLE = \"Manager Software Engineer\"\n", "JOB_DESCRIPTION = \"\"\"\n", "We are looking for a lead Software Engineer to join our team.\n", "You will be responsible for designing and implementing scalable backend services.\n", "Requirements: Strong experience with Python, distributed systems, and cloud infrastructure.\n", "\"\"\"\n", "\n", "# ── RUN THROUGH MODEL ──────────────────────────────────────\n", "model.eval()\n", "\n", "text = f\"[TITLE]: {JOB_TITLE} [DESC]: {JOB_DESCRIPTION.strip()}\"\n", "\n", "encoded = tokenizer(\n", " text,\n", " padding=\"max_length\",\n", " truncation=True,\n", " max_length=MAX_LENGTH,\n", " return_tensors=\"pt\"\n", ")\n", "\n", "input_ids = encoded[\"input_ids\"].to(DEVICE)\n", "attention_mask = encoded[\"attention_mask\"].to(DEVICE)\n", "token_type_ids = encoded[\"token_type_ids\"].to(DEVICE)\n", "\n", "with torch.no_grad():\n", " pred_exp, pred_lower, pred_upper = model(input_ids, attention_mask, token_type_ids)\n", "\n", "# ── DENORMALIZE ────────────────────────────────────────────\n", "exp_years = pred_exp.item() * norm_stats[\"expected_experience_years\"][\"std\"] + 
norm_stats[\"expected_experience_years\"][\"mean\"]\n", "lower_salary = pred_lower.item() * norm_stats[\"pay_lower\"][\"std\"] + norm_stats[\"pay_lower\"][\"mean\"]\n", "upper_salary = pred_upper.item() * norm_stats[\"pay_upper\"][\"std\"] + norm_stats[\"pay_upper\"][\"mean\"]\n", "\n", "# ── PRINT RESULT ───────────────────────────────────────────\n", "print(\"══════════════════════════════════════════\")\n", "print(\" JOB CLASSIFICATION\")\n", "print(\"══════════════════════════════════════════\")\n", "print(f\" Title: {JOB_TITLE}\")\n", "print(f\" Expected Experience: {max(0, round(exp_years))} years\")\n", "print(f\" Salary Range: ${max(0, lower_salary):,.0f} – ${max(0, upper_salary):,.0f}\")\n", "print(\"══════════════════════════════════════════\")" ] }, { "cell_type": "code", "execution_count": 47, "id": "b7e2d1f9-4731-4bc5-837b-ced802b3080c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Repo ready: https://huggingface.co/akzaidan/JobPredictor1\n", "All files saved locally:\n", " README.md\n", " config.json\n", " tokenizer.json\n", " tokenizer_config.json\n", " norm_stats.json\n", " pytorch_model.bin\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8e4dc024014b4fb495c8efe174b7f528", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing Files (0 / 0): | | 0.00B / 0.00B " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "efed4d0283334e059e403eaffcc1e9de", "version_major": 2, "version_minor": 0 }, "text/plain": [ "New Data Upload: | | 0.00B / 0.00B " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Model uploaded to: https://huggingface.co/akzaidan/JobPredictor1\n" ] } ], "source": [ "import json\n", "from huggingface_hub import HfApi, create_repo\n", "\n", "HF_USERNAME = \"akzaidan\"\n", "REPO_NAME = \"JobPredictor1\"\n", "REPO_ID = 
f\"{HF_USERNAME}/{REPO_NAME}\"\n", "\n", "# ── CREATE REPO ────────────────────────────────────────────\n", "api = create_repo(REPO_ID, repo_type=\"model\", exist_ok=True)\n", "print(f\"Repo ready: https://huggingface.co/{REPO_ID}\")\n", "\n", "# ── SAVE MODEL FILES LOCALLY ───────────────────────────────\n", "UPLOAD_DIR = \"/workspace/JobPredictor1\"\n", "os.makedirs(UPLOAD_DIR, exist_ok=True)\n", "\n", "# 1. Save model weights\n", "torch.save(model.state_dict(), f\"{UPLOAD_DIR}/pytorch_model.bin\")\n", "\n", "# 2. Save norm_stats (needed for inference)\n", "with open(f\"{UPLOAD_DIR}/norm_stats.json\", \"w\") as f:\n", " json.dump(norm_stats, f, indent=2)\n", "\n", "# 3. Save tokenizer\n", "tokenizer.save_pretrained(UPLOAD_DIR)\n", "\n", "# 4. Save model config so anyone knows what base model was used\n", "config_data = {\n", " \"base_model\": \"microsoft/deberta-v3-small\",\n", " \"architecture\": \"DeBERTa-v3-small + 3 regression heads\",\n", " \"outputs\": {\n", " \"expected_experience_years\": \"integer (years of experience)\",\n", " \"pay_lower\": \"integer (lower salary bound USD)\",\n", " \"pay_upper\": \"integer (upper salary bound USD)\"\n", " },\n", " \"normalization\": \"z-score — use norm_stats.json to denormalize predictions\",\n", " \"max_length\": 512,\n", " \"dropout\": 0.1,\n", "}\n", "with open(f\"{UPLOAD_DIR}/config.json\", \"w\") as f:\n", " json.dump(config_data, f, indent=2)\n", "\n", "# 5. 
Save a model card (README)\n", "model_card = \"\"\"---\n", "language: en\n", "tags:\n", " - job-classification\n", " - salary-prediction\n", " - experience-prediction\n", " - deberta\n", "---\n", "\n", "# JobPredictor1\n", "\n", "Fine-tuned DeBERTa-v3-small model that predicts:\n", "- **Expected years of experience** required for a job\n", "- **Lower salary bound** (USD)\n", "- **Upper salary bound** (USD)\n", "\n", "## Input Format\n", "```\n", "[TITLE]: [DESC]: \n", "```\n", "\n", "## Outputs\n", "| Output | Type | Description |\n", "|---|---|---|\n", "| expected_experience_years | int | Years of experience required |\n", "| pay_lower | int | Lower salary bound (USD) |\n", "| pay_upper | int | Upper salary bound (USD) |\n", "\n", "## Normalization\n", "Predictions are z-score normalized. Use `norm_stats.json` to denormalize:\n", "```python\n", "real_value = pred * norm_stats[col][\"std\"] + norm_stats[col][\"mean\"]\n", "```\n", "\n", "## Test Set Performance\n", "| Metric | Value |\n", "|---|---|\n", "| Experience MAE | 0.57 years |\n", "| Experience Within 1yr | 83.1% |\n", "| Pay Lower MAE | $15,511 |\n", "| Pay Lower Within $20k | 84.5% |\n", "| Pay Upper MAE | $20,190 |\n", "| Pay Upper Within $20k | 76.0% |\n", "\n", "## Base Model\n", "microsoft/deberta-v3-small\n", "\"\"\"\n", "with open(f\"{UPLOAD_DIR}/README.md\", \"w\") as f:\n", " f.write(model_card)\n", "\n", "print(\"All files saved locally:\")\n", "for fname in os.listdir(UPLOAD_DIR):\n", " print(f\" {fname}\")\n", "\n", "# ── UPLOAD TO HUGGINGFACE ──────────────────────────────────\n", "api = HfApi()\n", "api.upload_folder(\n", " folder_path=UPLOAD_DIR,\n", " repo_id=REPO_ID,\n", " repo_type=\"model\"\n", ")\n", "\n", "print(f\"\\n✓ Model uploaded to: https://huggingface.co/{REPO_ID}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "78c735e3-76aa-4e5b-8e38-02738f8a6e4e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 
(ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }