File size: 5,534 Bytes
96c59c3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | import joblib
import sys
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from utils import extract_deep_features, parse_sections
class YoePredictor:
    """Predict the experience level and minimum years of experience (YOE) of a job posting.

    The predictor concatenates a sentence embedding of the posting text with
    scaled heuristic features from ``extract_deep_features`` and feeds the
    result to a pre-trained level classifier and YOE regressor loaded from disk.
    """

    # Heuristic feature names, in the order they are handed to the scaler.
    # NOTE(review): presumably this must match the column order used when the
    # scaler was fitted - confirm against the training script.
    _META_COLS = (
        'min_yoe_found',
        'max_yoe_found',
        'regex_count',
        'has_explicit_yoe',
        'extraction_quality',
        'in_req_section',
        'has_phd',
        'has_masters',
        'is_manager',
    )

    def __init__(self):
        """Load all model artifacts and the optional confidence threshold.

        Prints an error and exits the process with status 1 if any model
        artifact cannot be loaded.
        """
        print("Loading models into memory...")
        try:
            # NOTE: joblib.load unpickles arbitrary objects; only load
            # artifacts that come from a trusted source.
            self.scaler = joblib.load('meta_scaler.pkl')
            self.clf = joblib.load('level_classifier.pkl')
            self.le = joblib.load('label_encoder.pkl')
            self.reg = joblib.load('yoe_regressor.pkl')
            self.sbert = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading models. Did you run train_improved.py? Error: {e}")
            sys.exit(1)
        self.low_conf_threshold = self._load_confidence_threshold(0.55)

    @staticmethod
    def _load_confidence_threshold(default, path='confidence_config.json'):
        """Return ``low_confidence_threshold`` from the JSON file at *path*.

        Falls back to *default* when the file is missing (the config is
        optional) or cannot be used; an unusable file is reported on stderr
        instead of being ignored silently.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                confidence_cfg = json.load(f)
            return float(confidence_cfg.get('low_confidence_threshold', default))
        except FileNotFoundError:
            return default
        except Exception as exc:
            # Best effort: a broken config must never prevent start-up.
            print(
                f"Warning: could not read {path} ({exc}); "
                f"using default confidence threshold {default}",
                file=sys.stderr,
            )
            return default

    @staticmethod
    def _is_intern_title(title):
        """Return True when *title* contains "intern"/"internship" as a whole word.

        A plain substring test would also match unrelated titles such as
        "International Sales Manager" or "Internal Tools Engineer".
        """
        import re  # local import: the module header does not import re

        return re.search(r"\bintern(?:s|ships?)?\b", title or "", re.IGNORECASE) is not None

    def predict(self, title, description, silent=False):
        """
        Predict YOE and review flag for a job description.

        Args:
            title: job title; ``None`` is treated as an empty string.
            description: job description; ``None`` is treated as an empty string.
            silent: when False (default), print a human-readable report.

        Returns dict:
            - level: predicted experience level label
            - yoe: predicted minimum years of experience (plain float, 1 decimal)
            - extracted_yoe: heuristic extracted minimum years, or None if absent
            - needs_manual_review: True when extraction is missing or model confidence is low
            - reason: comma-separated tags explaining the routing decision
            - confidence: level-classifier confidence (max probability)
            - confidence_threshold: threshold below which confidence counts as low
        """
        title = title or ""
        description = description or ""
        raw_text = (title + " " + description).lower()
        deep_feats = extract_deep_features(raw_text)

        # Embed the requirements section when it is long enough (> 100,
        # presumably characters); otherwise use the start of the full text.
        sections = parse_sections(raw_text)
        requirements = sections['requirements']
        sbert_context = requirements if len(requirements) > 100 else raw_text[:1536]
        # assumes encode() returns a 2-D array with one row - TODO confirm
        embedding = self.sbert.encode([sbert_context])

        meta_vals = [deep_feats[k] for k in self._META_COLS]
        meta_scaled = self.scaler.transform([meta_vals])
        X = np.hstack([embedding, meta_scaled])

        level_idx = self.clf.predict(X)[0]
        level_probs = self.clf.predict_proba(X)[0]
        level_confidence = float(np.max(level_probs))
        # Cast to built-in types so the result dict holds no numpy scalars
        # (e.g. np.float32 is not JSON serializable).
        level = str(self.le.classes_[level_idx])
        yoe_pred = float(self.reg.predict(X)[0])

        reason_tags = []
        min_yoe_found = deep_feats['min_yoe_found']

        # Heuristic overrides: never predict below an explicitly extracted minimum.
        if min_yoe_found > yoe_pred:
            yoe_pred = float(min_yoe_found)
            reason_tags.append('heuristic_override_to_extracted_min')
        yoe_pred = max(0.0, round(yoe_pred, 1))

        if self._is_intern_title(title):
            level = 'entry'
            yoe_pred = 0.0
            reason_tags.append('intern_title_override')

        # A negative min_yoe_found is treated as "nothing extracted".
        extracted_yoe = int(min_yoe_found) if min_yoe_found >= 0 else None
        extraction_missing = extracted_yoe is None
        low_confidence = level_confidence < self.low_conf_threshold
        needs_manual_review = extraction_missing or low_confidence

        if extraction_missing:
            reason_tags.append('missing_explicit_yoe_extraction')
        if low_confidence:
            reason_tags.append('low_model_confidence')
        if not reason_tags:
            reason_tags.append('model_prediction_confident')

        result = {
            'level': level,
            'yoe': yoe_pred,
            'extracted_yoe': extracted_yoe,
            'needs_manual_review': needs_manual_review,
            'reason': ", ".join(reason_tags),
            'confidence': round(level_confidence, 4),
            'confidence_threshold': round(self.low_conf_threshold, 4),
        }

        if not silent:
            # Simple static margin (20% of the estimate, at least one year);
            # it is only used for the printed report.
            margin = max(1.0, round(yoe_pred * 0.2, 1))
            range_low = round(max(0.0, yoe_pred - margin), 1)
            range_high = round(yoe_pred + margin, 1)
            print("\n--- PREDICTION REPORT ---")
            print(f"Title: {title}")
            print(f"Experience Level: {result['level'].upper()}")
            print(f"Estimated YOE: {result['yoe']}")
            print(f"Expected Range: {range_low} - {range_high} years")
            print(f"Extracted YOE: {result['extracted_yoe']}")
            print(f"Confidence: {result['confidence']} (threshold={result['confidence_threshold']})")
            print(f"Needs Manual Review: {result['needs_manual_review']}")
            print(f"Reason: {result['reason']}")

        return result
# Run a one-off prediction when this file is executed directly as a script (nothing runs on import).
if __name__ == "__main__":
    # Command-line entry point: `script.py <title> <description>`.
    predictor = YoePredictor()
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        # Title and description were both supplied on the command line.
        predictor.predict(cli_args[0], cli_args[1])
    else:
        # Fewer than two arguments: fall back to a built-in demo posting.
        demo_title = "Staff Software Engineer"
        demo_desc = "Looking for a technical leader with at least twelve years of industry experience. Founded 5 years ago."
        predictor.predict(demo_title, demo_desc)