# CandoramClassification / predict_yoe.py
# PatienceIzere's picture
# Upload 12 files
# 96c59c3 verified
import json
import re
import sys

import joblib
import numpy as np
from sentence_transformers import SentenceTransformer

from utils import extract_deep_features, parse_sections
class YoePredictor:
    """Predict the experience level and minimum years of experience (YOE) of a job posting.

    A sentence embedding of the posting text is concatenated with scaled
    metadata features and passed to a level classifier and a YOE regressor.
    All artifacts are loaded once, when the instance is constructed.
    """

    # Metadata features, in the order they are handed to the scaler.
    # NOTE(review): presumably this must match the column order the scaler was
    # fitted with -- confirm against the training script.
    _META_COLS = (
        'min_yoe_found',
        'max_yoe_found',
        'regex_count',
        'has_explicit_yoe',
        'extraction_quality',
        'in_req_section',
        'has_phd',
        'has_masters',
        'is_manager',
    )

    # Threshold used when confidence_config.json is absent or unusable.
    _DEFAULT_LOW_CONF_THRESHOLD = 0.55

    # Whole-word match for "intern", "interns", "internship(s)". A plain
    # substring test would also hit "internal", "international", "internet".
    _INTERN_TITLE_RE = re.compile(r"\bintern(?:s|ships?)?\b", re.IGNORECASE)

    def __init__(self):
        """Load the model artifacts and the optional confidence configuration.

        Artifact paths are relative to the current working directory. If any
        artifact fails to load, the error is printed and the process exits
        with status 1.
        """
        print("Loading models into memory...")
        try:
            # NOTE: joblib artifacts are pickle-based, and unpickling can run
            # arbitrary code. Only load files that come from a trusted source.
            self.scaler = joblib.load('meta_scaler.pkl')
            self.clf = joblib.load('level_classifier.pkl')
            self.le = joblib.load('label_encoder.pkl')
            self.reg = joblib.load('yoe_regressor.pkl')
            self.sbert = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading models. Did you run train_improved.py? Error: {e}")
            sys.exit(1)
        self.low_conf_threshold = self._load_low_conf_threshold()

    @staticmethod
    def _load_low_conf_threshold(path='confidence_config.json'):
        """Return 'low_confidence_threshold' from the JSON file at *path*.

        Falls back to the built-in default when the file is missing (silently)
        or cannot be read/parsed (with a warning on stderr). Never raises an
        ``Exception`` subclass, so a bad config cannot abort model loading.
        """
        default = YoePredictor._DEFAULT_LOW_CONF_THRESHOLD
        try:
            with open(path, 'r', encoding='utf-8') as f:
                confidence_cfg = json.load(f)
            return float(confidence_cfg.get('low_confidence_threshold', default))
        except FileNotFoundError:
            # The config file is optional; its absence is the normal case.
            return default
        except Exception as exc:
            # Best effort: keep running, but make the problem visible.
            print(
                f"Warning: could not use {path} ({exc}); "
                f"falling back to low-confidence threshold {default}",
                file=sys.stderr,
            )
            return default

    @staticmethod
    def _is_intern_title(title):
        """Return True when *title* contains intern/interns/internship(s) as a whole word."""
        return bool(YoePredictor._INTERN_TITLE_RE.search(title))

    def predict(self, title, description, silent=False):
        """Predict YOE and a manual-review flag for a job posting.

        Args:
            title: Job title. ``None`` is treated as an empty string.
            description: Job description. ``None`` is treated as an empty string.
            silent: When False (default), a human-readable report is printed.

        Returns:
            dict with keys:
            - level: predicted experience level label
            - yoe: predicted minimum years of experience (native float, 1 decimal)
            - extracted_yoe: heuristic extracted minimum years, or None if absent
            - needs_manual_review: True when extraction is missing or model
              confidence is low
            - reason: comma-separated tags explaining the routing decision
            - confidence: level-classifier confidence (max class probability)
            - confidence_threshold: threshold below which confidence is "low"
        """
        # Missing fields are scored as empty text instead of raising on None.
        title = title or ""
        description = description or ""
        raw_text = (title + " " + description).lower()
        deep_feats = extract_deep_features(raw_text)

        # Embed the requirements section when it holds more than 100
        # characters; otherwise fall back to the first 1536 characters of the
        # whole posting.
        sections = parse_sections(raw_text)
        requirements = sections['requirements']
        sbert_context = requirements if len(requirements) > 100 else raw_text[:1536]
        embedding = self.sbert.encode([sbert_context])

        meta_vals = [deep_feats[k] for k in self._META_COLS]
        meta_scaled = self.scaler.transform([meta_vals])
        X = np.hstack([embedding, meta_scaled])

        level_idx = self.clf.predict(X)[0]
        level_probs = self.clf.predict_proba(X)[0]
        level_confidence = float(np.max(level_probs))
        level = self.le.classes_[level_idx]

        # Cast to a native float so the result stays JSON-serializable
        # whatever NumPy dtype the regressor returns.
        yoe_pred = float(self.reg.predict(X)[0])
        reason_tags = []

        # Heuristic override: never predict below the extracted minimum.
        min_yoe_found = deep_feats['min_yoe_found']
        if min_yoe_found > yoe_pred:
            yoe_pred = float(min_yoe_found)
            reason_tags.append('heuristic_override_to_extracted_min')
        yoe_pred = max(0.0, round(yoe_pred, 1))

        # Heuristic override: internship postings are entry level with 0 YOE.
        if self._is_intern_title(title):
            level = 'entry'
            yoe_pred = 0.0
            reason_tags.append('intern_title_override')

        # A negative min_yoe_found is treated as "nothing extracted"
        # (presumably the sentinel used by extract_deep_features -- confirm).
        extracted_yoe = int(min_yoe_found) if min_yoe_found >= 0 else None
        extraction_missing = extracted_yoe is None
        low_confidence = level_confidence < self.low_conf_threshold
        needs_manual_review = extraction_missing or low_confidence
        if extraction_missing:
            reason_tags.append('missing_explicit_yoe_extraction')
        if low_confidence:
            reason_tags.append('low_model_confidence')
        if not reason_tags:
            reason_tags.append('model_prediction_confident')

        result = {
            'level': level,
            'yoe': yoe_pred,
            'extracted_yoe': extracted_yoe,
            'needs_manual_review': needs_manual_review,
            'reason': ", ".join(reason_tags),
            'confidence': round(level_confidence, 4),
            'confidence_threshold': round(self.low_conf_threshold, 4),
        }

        if not silent:
            # Static display margin: 20% of the estimate, at least one year.
            # Only needed for the report, so it is computed here.
            margin = max(1.0, round(yoe_pred * 0.2, 1))
            # Round the bounds so float noise (e.g. 4.300000000000001) is not shown.
            range_low = round(max(0.0, yoe_pred - margin), 1)
            range_high = round(yoe_pred + margin, 1)
            print("\n--- PREDICTION REPORT ---")
            print(f"Title: {title}")
            print(f"Experience Level: {result['level'].upper()}")
            print(f"Estimated YOE: {result['yoe']}")
            print(f"Expected Range: {range_low} - {range_high} years")
            print(f"Extracted YOE: {result['extracted_yoe']}")
            print(f"Confidence: {result['confidence']} (threshold={result['confidence_threshold']})")
            print(f"Needs Manual Review: {result['needs_manual_review']}")
            print(f"Reason: {result['reason']}")
        return result
# Script entry point: the predictor is built only when this file is executed
# directly, not when it is imported. With fewer than two command-line
# arguments a built-in sample posting is scored; otherwise the first two
# arguments are used as the job title and the job description.
if __name__ == "__main__":
    predictor = YoePredictor()
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        job_title, job_desc = cli_args[0], cli_args[1]
    else:
        job_title = "Staff Software Engineer"
        job_desc = "Looking for a technical leader with at least twelve years of industry experience. Founded 5 years ago."
    predictor.predict(job_title, job_desc)