Spaces:

PatienceIzere
/

CandoramClassification

Sleeping

File size: 5,534 Bytes

96c59c3

import joblib
import sys
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from utils import extract_deep_features, parse_sections

class YoePredictor:
    """Predict experience level and minimum years of experience (YOE) for a job posting.

    At construction the class loads a meta-feature scaler, a level classifier,
    a label encoder and a YOE regressor via joblib, plus a SentenceTransformer
    text encoder. ``predict`` combines the text embedding with heuristic
    features from ``extract_deep_features`` and applies a few rule-based
    overrides before deciding whether the posting needs manual review.
    """

    # Threshold used when confidence_config.json is missing or unreadable.
    DEFAULT_LOW_CONF_THRESHOLD = 0.55

    # Keys read from the extract_deep_features() result, in the order they are
    # passed to the scaler.
    # NOTE(review): presumably this must match the column order used when the
    # scaler was fit -- confirm against the training script.
    META_COLS = (
        'min_yoe_found',
        'max_yoe_found',
        'regex_count',
        'has_explicit_yoe',
        'extraction_quality',
        'in_req_section',
        'has_phd',
        'has_masters',
        'is_manager',
    )

    # Whole-word title tokens that trigger the intern override. Matching on
    # tokens (not substrings) keeps "Internal" / "International" from firing.
    _INTERN_TOKENS = frozenset({'intern', 'interns', 'internship', 'internships'})

    def __init__(self):
        """Load all model artifacts; exit the process with status 1 on failure."""
        print("Loading models into memory...")
        try:
            # NOTE: joblib.load unpickles its input; only load trusted artifacts.
            self.scaler = joblib.load('meta_scaler.pkl')
            self.clf = joblib.load('level_classifier.pkl')
            self.le = joblib.load('label_encoder.pkl')
            self.reg = joblib.load('yoe_regressor.pkl')
            self.sbert = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading models. Did you run train_improved.py? Error: {e}")
            sys.exit(1)

        # The confidence config is optional: a problem here must never prevent
        # the predictor from starting, so the helper always returns a value.
        self.low_conf_threshold = self._load_confidence_threshold(
            'confidence_config.json', self.DEFAULT_LOW_CONF_THRESHOLD
        )

    @staticmethod
    def _load_confidence_threshold(path, default):
        """Return 'low_confidence_threshold' from the JSON file at *path*, else *default*.

        A missing file is the expected case and falls back silently. Any other
        problem (unreadable file, invalid JSON, wrong value type) also falls
        back to *default*, but is reported on stderr instead of being hidden.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                confidence_cfg = json.load(f)
            return float(confidence_cfg.get('low_confidence_threshold', default))
        except FileNotFoundError:
            return default
        except Exception as exc:
            # Deliberately broad: loading this config is best-effort only.
            print(
                f"Warning: could not read {path} ({exc}); "
                f"using default confidence threshold {default}",
                file=sys.stderr,
            )
            return default

    @classmethod
    def _is_intern_title(cls, title):
        """Return True when *title* contains 'intern'/'internship' as a whole word."""
        tokens = ''.join(ch if ch.isalnum() else ' ' for ch in title.lower()).split()
        return not cls._INTERN_TOKENS.isdisjoint(tokens)

    def _build_result(self, title, deep_feats, level, level_confidence, yoe_pred):
        """Apply heuristic overrides and review routing; return the result dict.

        Parameters:
        - title: job title (str), used for the intern override
        - deep_feats: mapping that provides at least 'min_yoe_found'
        - level: level label predicted by the classifier
        - level_confidence: max class probability from the classifier
        - yoe_pred: raw regressor output (may be a NumPy scalar)
        """
        min_yoe_found = deep_feats['min_yoe_found']
        reason_tags = []

        # Convert to a built-in float before rounding so the result is
        # JSON-serializable and free of float32 representation artifacts.
        yoe_pred = float(yoe_pred)
        level = str(level)

        # Never predict less than the minimum explicitly found in the text.
        if min_yoe_found > yoe_pred:
            yoe_pred = float(min_yoe_found)
            reason_tags.append('heuristic_override_to_extracted_min')

        yoe_pred = max(0.0, round(yoe_pred, 1))

        # Internships are always entry level, whatever the models say.
        if self._is_intern_title(title):
            level = 'entry'
            yoe_pred = 0.0
            reason_tags.append('intern_title_override')

        # A negative 'min_yoe_found' is treated as "nothing extracted".
        extracted_yoe = int(min_yoe_found) if min_yoe_found >= 0 else None
        extraction_missing = extracted_yoe is None
        low_confidence = bool(level_confidence < self.low_conf_threshold)

        if extraction_missing:
            reason_tags.append('missing_explicit_yoe_extraction')
        if low_confidence:
            reason_tags.append('low_model_confidence')
        if not reason_tags:
            reason_tags.append('model_prediction_confident')

        return {
            'level': level,
            'yoe': yoe_pred,
            'extracted_yoe': extracted_yoe,
            'needs_manual_review': extraction_missing or low_confidence,
            'reason': ", ".join(reason_tags),
            'confidence': round(float(level_confidence), 4),
            'confidence_threshold': round(self.low_conf_threshold, 4),
        }

    @staticmethod
    def _print_report(title, result):
        """Print a human-readable summary of *result* to stdout."""
        yoe = result['yoe']
        # Static +/-20% margin (at least one year); not a statistical interval.
        margin = max(1.0, round(yoe * 0.2, 1))
        range_low = round(max(0.0, yoe - margin), 1)
        range_high = round(yoe + margin, 1)

        print("\n--- PREDICTION REPORT ---")
        print(f"Title: {title}")
        print(f"Experience Level: {result['level'].upper()}")
        print(f"Estimated YOE: {result['yoe']}")
        print(f"Expected Range: {range_low} - {range_high} years")
        print(f"Extracted YOE: {result['extracted_yoe']}")
        print(f"Confidence: {result['confidence']} (threshold={result['confidence_threshold']})")
        print(f"Needs Manual Review: {result['needs_manual_review']}")
        print(f"Reason: {result['reason']}")

    def predict(self, title, description, silent=False):
        """
        Predict YOE and review flag for a job description.

        Parameters:
        - title: job title; None is treated as an empty string
        - description: job description text; None is treated as an empty string
        - silent: when True, skip printing the prediction report

        Returns dict:
        - level: predicted experience level label
        - yoe: predicted minimum years of experience (float, one decimal)
        - extracted_yoe: heuristic extracted minimum years, or None if absent
        - needs_manual_review: True when extraction is missing or model confidence is low
        - reason: short explanation for routing decision
        - confidence: level-classifier confidence (max probability)
        - confidence_threshold: threshold below which confidence counts as low
        """
        title = "" if title is None else str(title)
        description = "" if description is None else str(description)
        raw_text = (title + " " + description).lower()

        deep_feats = extract_deep_features(raw_text)

        # Embed the requirements section when it is long enough to be useful;
        # otherwise fall back to the first 1536 characters of the posting.
        sections = parse_sections(raw_text)
        requirements = sections['requirements']
        sbert_context = requirements if len(requirements) > 100 else raw_text[:1536]
        embedding = self.sbert.encode([sbert_context])

        meta_vals = [deep_feats[k] for k in self.META_COLS]
        meta_scaled = self.scaler.transform([meta_vals])

        # Feature row: text embedding followed by the scaled meta features.
        X = np.hstack([embedding, meta_scaled])

        level_idx = self.clf.predict(X)[0]
        level_probs = self.clf.predict_proba(X)[0]
        level_confidence = float(np.max(level_probs))
        level = self.le.classes_[level_idx]

        yoe_pred = self.reg.predict(X)[0]

        result = self._build_result(title, deep_feats, level, level_confidence, yoe_pred)

        if not silent:
            self._print_report(title, result)

        return result

# Command-line entry point: runs one prediction when this file is executed
# directly. Nothing below runs when the module is imported.
if __name__ == "__main__":
    predictor = YoePredictor()

    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        # The first two positional arguments are the job title and description.
        job_title, job_description = cli_args[0], cli_args[1]
    else:
        # Fewer than two arguments: fall back to a built-in sample posting.
        job_title = "Staff Software Engineer"
        job_description = "Looking for a technical leader with at least twelve years of industry experience. Founded 5 years ago."

    predictor.predict(job_title, job_description)