File size: 5,534 Bytes
96c59c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import json
import re
import sys

import joblib
import numpy as np
from sentence_transformers import SentenceTransformer

from utils import extract_deep_features, parse_sections

class YoePredictor:
    """Predict the experience level and minimum years of experience of a job posting.

    On construction the predictor loads four joblib artifacts from the current
    working directory (``meta_scaler.pkl``, ``level_classifier.pkl``,
    ``label_encoder.pkl``, ``yoe_regressor.pkl``) plus the
    ``all-MiniLM-L6-v2`` SentenceTransformer, and reads an optional
    ``confidence_config.json`` for the low-confidence threshold.
    """

    # Meta features fed to the scaler, in this exact order.
    # NOTE(review): presumably this must match the column order used when the
    # scaler was fitted -- confirm against the training script.
    _META_COLS = (
        'min_yoe_found',
        'max_yoe_found',
        'regex_count',
        'has_explicit_yoe',
        'extraction_quality',
        'in_req_section',
        'has_phd',
        'has_masters',
        'is_manager',
    )

    # Whole-word match: a plain substring test would also hit titles such as
    # "International Sales Manager" or "Internal Tools Engineer".
    _INTERN_TITLE_RE = re.compile(r"\bintern(?:s|ship|ships)?\b", re.IGNORECASE)

    def __init__(self):
        """Load all models into memory; print an error and exit(1) on failure."""
        print("Loading models into memory...")
        try:
            # NOTE(review): joblib.load unpickles arbitrary objects -- only
            # load artifacts produced by a trusted training run.
            self.scaler = joblib.load('meta_scaler.pkl')
            self.clf = joblib.load('level_classifier.pkl')
            self.le = joblib.load('label_encoder.pkl')
            self.reg = joblib.load('yoe_regressor.pkl')
            self.sbert = SentenceTransformer('all-MiniLM-L6-v2')
            self.low_conf_threshold = self._load_confidence_threshold(
                'confidence_config.json', 0.55
            )
        except Exception as e:
            print(f"Error loading models. Did you run train_improved.py? Error: {e}")
            sys.exit(1)

    @staticmethod
    def _load_confidence_threshold(path, default):
        """Return ``low_confidence_threshold`` from the JSON file at *path*.

        Falls back to *default* when the file is missing (silently, since the
        config is optional) or unreadable/malformed (with a warning on stderr).
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                confidence_cfg = json.load(f)
            return float(confidence_cfg.get('low_confidence_threshold', default))
        except FileNotFoundError:
            # The config file is optional; its absence is not an error.
            return default
        except (OSError, ValueError, TypeError, AttributeError) as exc:
            # Stay best-effort, but do not hide a broken config from the operator.
            print(
                f"Warning: ignoring invalid confidence config {path!r}: {exc}",
                file=sys.stderr,
            )
            return default

    @classmethod
    def _is_intern_title(cls, title):
        """Return True when *title* contains intern/interns/internship(s) as a whole word."""
        return cls._INTERN_TITLE_RE.search(title) is not None

    @staticmethod
    def _print_report(title, result):
        """Print the human-readable prediction report for *result* to stdout."""
        yoe = result['yoe']
        # Static +/-20% margin (at least one year) around the point estimate.
        margin = max(1.0, round(yoe * 0.2, 1))
        # Round the bounds so float subtraction artefacts are not printed.
        range_low = round(max(0.0, yoe - margin), 1)
        range_high = round(yoe + margin, 1)

        print("\n--- PREDICTION REPORT ---")
        print(f"Title: {title}")
        print(f"Experience Level: {result['level'].upper()}")
        print(f"Estimated YOE: {yoe}")
        print(f"Expected Range: {range_low} - {range_high} years")
        print(f"Extracted YOE: {result['extracted_yoe']}")
        print(f"Confidence: {result['confidence']} (threshold={result['confidence_threshold']})")
        print(f"Needs Manual Review: {result['needs_manual_review']}")
        print(f"Reason: {result['reason']}")

    def predict(self, title, description, silent=False):
        """Predict YOE and a manual-review flag for a job description.

        Args:
            title: Job title; ``None`` is treated as an empty string.
            description: Job description text; ``None`` is treated as empty.
            silent: When True, skip printing the prediction report.

        Returns:
            dict with the keys:

            - level: predicted experience level label
            - yoe: predicted minimum years of experience (float, >= 0)
            - extracted_yoe: heuristically extracted minimum years, or None
              if absent
            - needs_manual_review: True when extraction is missing or model
              confidence is low
            - reason: comma-separated tags explaining the routing decision
            - confidence: level-classifier confidence (max probability)
            - confidence_threshold: threshold below which confidence counts
              as low
        """
        # Treat missing text as empty instead of failing on concatenation.
        title = title or ""
        description = description or ""
        raw_text = (title + " " + description).lower()

        deep_feats = extract_deep_features(raw_text)

        # Embed the requirements section when it is longer than 100 characters;
        # otherwise fall back to the first 1536 characters of the whole text.
        requirements = parse_sections(raw_text)['requirements']
        sbert_context = requirements if len(requirements) > 100 else raw_text[:1536]
        embedding = self.sbert.encode([sbert_context])

        meta_vals = [deep_feats[k] for k in self._META_COLS]
        meta_scaled = self.scaler.transform([meta_vals])

        X = np.hstack([embedding, meta_scaled])

        level_idx = self.clf.predict(X)[0]
        level_confidence = float(np.max(self.clf.predict_proba(X)[0]))
        # Convert NumPy scalars to native types so the result is JSON-friendly.
        level = str(self.le.classes_[level_idx])
        yoe_pred = float(self.reg.predict(X)[0])

        reason_tags = []
        min_yoe_found = deep_feats['min_yoe_found']
        # A negative value means no explicit YOE was extracted from the text.
        extraction_missing = min_yoe_found < 0

        # Heuristic override: never predict below an explicitly stated minimum.
        if not extraction_missing and min_yoe_found > yoe_pred:
            yoe_pred = float(min_yoe_found)
            reason_tags.append('heuristic_override_to_extracted_min')

        yoe_pred = max(0.0, round(yoe_pred, 1))

        if self._is_intern_title(title):
            level = 'entry'
            yoe_pred = 0.0
            reason_tags.append('intern_title_override')

        extracted_yoe = None if extraction_missing else int(min_yoe_found)
        low_confidence = level_confidence < self.low_conf_threshold
        if extraction_missing:
            reason_tags.append('missing_explicit_yoe_extraction')
        if low_confidence:
            reason_tags.append('low_model_confidence')
        if not reason_tags:
            reason_tags.append('model_prediction_confident')

        result = {
            'level': level,
            'yoe': yoe_pred,
            'extracted_yoe': extracted_yoe,
            'needs_manual_review': bool(extraction_missing or low_confidence),
            'reason': ", ".join(reason_tags),
            'confidence': round(level_confidence, 4),
            'confidence_threshold': round(self.low_conf_threshold, 4)
        }

        if not silent:
            self._print_report(title, result)

        return result

# Script entry point: runs only when this file is executed directly (not on import).
# Usage: <script> "<job title>" "<job description>"; with fewer than two
# arguments a built-in demo posting is scored instead.
if __name__ == "__main__":
    predictor = YoePredictor()

    if len(sys.argv) >= 3:
        cli_title, cli_description = sys.argv[1], sys.argv[2]
    else:
        cli_title = "Staff Software Engineer"
        cli_description = "Looking for a technical leader with at least twelve years of industry experience. Founded 5 years ago."

    predictor.predict(cli_title, cli_description)