"""Command-line and importable predictor for years-of-experience (YOE)."""
import json
import re
import sys

import joblib
import numpy as np
from sentence_transformers import SentenceTransformer

from utils import extract_deep_features, parse_sections


class YoePredictor:
    """Predict experience level and minimum YOE for a job posting.

    On construction, loads a feature scaler, a level classifier with its
    label encoder, a YOE regressor, and a SentenceTransformer text encoder.
    The process exits with status 1 if any of these cannot be loaded.
    """

    # Keys read from the dict returned by ``extract_deep_features``.
    # The order is the column order passed to ``self.scaler.transform``.
    _META_COLS = (
        'min_yoe_found', 'max_yoe_found', 'regex_count', 'has_explicit_yoe',
        'extraction_quality', 'in_req_section', 'has_phd', 'has_masters',
        'is_manager',
    )

    # Whole-word match so titles such as "Internal Auditor" or
    # "International Sales Manager" do not trigger the intern override.
    _INTERN_TITLE_RE = re.compile(r"\bintern(?:ship)?s?\b", re.IGNORECASE)

    _DEFAULT_LOW_CONF_THRESHOLD = 0.55
    _CONFIDENCE_CONFIG_PATH = 'confidence_config.json'

    def __init__(self):
        print("Loading models into memory...")
        try:
            # NOTE: joblib.load unpickles its input; these artifact files
            # must come from a trusted source.
            self.scaler = joblib.load('meta_scaler.pkl')
            self.clf = joblib.load('level_classifier.pkl')
            self.le = joblib.load('label_encoder.pkl')
            self.reg = joblib.load('yoe_regressor.pkl')
            self.sbert = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading models. Did you run train_improved.py? Error: {e}")
            sys.exit(1)

        self.low_conf_threshold = self._load_low_conf_threshold(
            self._CONFIDENCE_CONFIG_PATH, self._DEFAULT_LOW_CONF_THRESHOLD
        )

    @staticmethod
    def _load_low_conf_threshold(path, default):
        """Return the 'low_confidence_threshold' from *path*, else *default*.

        Loading is best-effort: a missing file silently yields *default*;
        any other failure (unreadable file, malformed JSON, non-numeric
        value) also yields *default* but is reported on stderr.
        """
        try:
            with open(path, 'r') as f:
                confidence_cfg = json.load(f)
            return float(confidence_cfg.get('low_confidence_threshold', default))
        except FileNotFoundError:
            return default
        except Exception as exc:
            # Deliberately broad: a bad optional config must never prevent
            # the predictor from starting.
            print(
                f"Warning: ignoring {path} ({exc}); "
                f"using default threshold {default}",
                file=sys.stderr,
            )
            return default

    def predict(self, title, description, silent=False):
        """
        Predict YOE and review flag for a job description.

        Args:
            title: Job title; ``None`` is treated as an empty string.
            description: Job description; ``None`` is treated as an empty
                string.
            silent: When true, skip printing the prediction report.

        Returns dict:
          - level: predicted experience level label
          - yoe: predicted minimum years of experience (float, >= 0,
            rounded to one decimal)
          - extracted_yoe: heuristic extracted minimum years, or None if absent
          - needs_manual_review: True when extraction is missing or model
            confidence is low
          - reason: comma-separated tags explaining the routing decision
          - confidence: level-classifier confidence (max probability),
            rounded to four decimals
          - confidence_threshold: threshold below which confidence is
            considered low, rounded to four decimals
        """
        title = title or ""
        description = description or ""
        raw_text = (title + " " + description).lower()
        deep_feats = extract_deep_features(raw_text)

        # Embed the requirements section when it is long enough to carry
        # signal; otherwise fall back to the head of the full text.
        sections = parse_sections(raw_text)
        requirements = sections['requirements']
        sbert_context = requirements if len(requirements) > 100 else raw_text[:1536]
        embedding = self.sbert.encode([sbert_context])

        meta_vals = [deep_feats[k] for k in self._META_COLS]
        meta_scaled = self.scaler.transform([meta_vals])
        X = np.hstack([embedding, meta_scaled])

        level_idx = self.clf.predict(X)[0]
        level_probs = self.clf.predict_proba(X)[0]
        level_confidence = float(np.max(level_probs))
        # Convert NumPy scalars to built-in types so the result dict stays
        # plain-Python (e.g. JSON-serializable).
        level = str(self.le.classes_[level_idx])
        yoe_pred = float(self.reg.predict(X)[0])

        reason_tags = []
        min_yoe_found = deep_feats['min_yoe_found']

        # Heuristic override: never predict below the extracted minimum.
        if min_yoe_found > yoe_pred:
            yoe_pred = float(min_yoe_found)
            reason_tags.append('heuristic_override_to_extracted_min')

        yoe_pred = max(0.0, round(yoe_pred, 1))

        if self._INTERN_TITLE_RE.search(title):
            level = 'entry'
            yoe_pred = 0.0
            reason_tags.append('intern_title_override')

        # A negative value is treated as "nothing extracted" (presumably the
        # sentinel used by extract_deep_features; confirm against utils).
        extracted_yoe = int(min_yoe_found) if min_yoe_found >= 0 else None
        extraction_missing = extracted_yoe is None
        low_confidence = level_confidence < self.low_conf_threshold
        needs_manual_review = extraction_missing or low_confidence

        if extraction_missing:
            reason_tags.append('missing_explicit_yoe_extraction')
        if low_confidence:
            reason_tags.append('low_model_confidence')
        if not reason_tags:
            reason_tags.append('model_prediction_confident')

        result = {
            'level': level,
            'yoe': yoe_pred,
            'extracted_yoe': extracted_yoe,
            'needs_manual_review': needs_manual_review,
            'reason': ", ".join(reason_tags),
            'confidence': round(level_confidence, 4),
            'confidence_threshold': round(self.low_conf_threshold, 4),
        }

        if not silent:
            self._print_report(title, result)

        return result

    @staticmethod
    def _print_report(title, result):
        """Print a human-readable summary of one prediction *result*."""
        yoe = result['yoe']
        # Static +/-20% margin (at least one year) around the estimate.
        margin = max(1.0, round(yoe * 0.2, 1))
        # Round the bounds so float noise (e.g. 6.300000000000001) is not
        # shown to the user.
        range_low = round(max(0.0, yoe - margin), 1)
        range_high = round(yoe + margin, 1)

        print("\n--- PREDICTION REPORT ---")
        print(f"Title: {title}")
        print(f"Experience Level: {result['level'].upper()}")
        print(f"Estimated YOE: {yoe}")
        print(f"Expected Range: {range_low} - {range_high} years")
        print(f"Extracted YOE: {result['extracted_yoe']}")
        print(f"Confidence: {result['confidence']} (threshold={result['confidence_threshold']})")
        print(f"Needs Manual Review: {result['needs_manual_review']}")
        print(f"Reason: {result['reason']}")


# Script entry point: the predictor is only created when this file is run
# directly. Importing modules must instantiate YoePredictor themselves.
if __name__ == "__main__":
    predictor = YoePredictor()

    if len(sys.argv) < 3:
        # No title/description supplied: run a built-in sample posting.
        test_title = "Staff Software Engineer"
        test_desc = "Looking for a technical leader with at least twelve years of industry experience. Founded 5 years ago."
        predictor.predict(test_title, test_desc)
    else:
        predictor.predict(sys.argv[1], sys.argv[2])