from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np

from hopcroft_skill_classification_tool_competition.config import (
    API_CONFIG,
    DATA_PATHS,
    EMBEDDING_MODEL_NAME,
    MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text


class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If a required artifact file is missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # Candidate locations for kept_label_indices.npy (may live in the
        # models dir or alongside the tfidf/embedding feature outputs).
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        self.kept_indices = None

        self._load_artifacts()

    def _load_artifacts(self):
        """
        Load the model, the vectorizer/encoder, label names, and kept indices.

        Raises:
            FileNotFoundError: If the model or label-names artifact is missing.
            ValueError: If self.feature_type is not "tfidf" or "embedding".
        """
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first candidate location that exists.
        # Search order (models dir, then embedding, then tfidf) preserves the
        # original lookup priority.
        for candidate in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if candidate.exists():
                print(f"Loading kept indices from {candidate}")
                self.kept_indices = np.load(candidate)
                break
        else:
            # No filtering artifact: model outputs map 1:1 onto label_names.
            print("No kept_label_indices.npy found. Assuming all labels are used.")
            self.kept_indices = None

    def _load_tfidf_vectorizer(self):
        """Load the TF-IDF vectorizer.

        Raises:
            FileNotFoundError: If the fitted vectorizer pickle is missing.
        """
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self):
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If the optional sentence-transformers package is absent.
        """
        # Imported lazily so tfidf-only deployments don't need the package.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e
        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> Any:
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            Feature matrix of shape (1, n_features) ready for model
            prediction: a scipy sparse matrix for "tfidf", a dense
            numpy array for "embedding".
        """
        if self.feature_type == "tfidf":
            # TF-IDF: use stemming; transform() yields a sparse matrix,
            # which the downstream sklearn model accepts as-is.
            cleaned = clean_github_text(text, use_stemming=True)
            features = self.vectorizer.transform([cleaned])
            return features
        else:
            # Embedding: no stemming (LLMs need full words)
            cleaned = clean_github_text(text, use_stemming=False)
            features = self.vectorizer.encode([cleaned], convert_to_numpy=True)
            return features

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence descending.
        """
        # Transform text to features
        features = self._transform_text(text)

        # MultiOutputClassifier predict_proba returns a list of arrays (one per
        # label); each array is (n_samples, n_classes_seen) -> [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)

        # Extract positive-class probabilities
        confidence_scores = []
        for i, prob in enumerate(probas_list):
            if prob.shape[1] >= 2:
                confidence_scores.append(prob[0][1])
            else:
                # A per-label estimator trained on a single class emits only one
                # probability column; inspect classes_ to learn which class it is.
                try:
                    estimator = self.model.estimators_[i]
                    classes = estimator.classes_
                    if len(classes) == 1 and classes[0] == 1:
                        confidence_scores.append(1.0)
                    else:
                        confidence_scores.append(0.0)
                except Exception:
                    # Best-effort: treat an unreadable estimator as "not present".
                    confidence_scores.append(0.0)
        confidence_scores = np.array(confidence_scores)

        # Filter by threshold and map model output positions to label names.
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score >= threshold:
                if self.kept_indices is not None:
                    # kept_indices maps compacted output position -> original
                    # label index; positions beyond it cannot be named, skip.
                    if i < len(self.kept_indices):
                        original_idx = self.kept_indices[i]
                        skill_name = self.label_names[original_idx]
                    else:
                        continue
                else:
                    if i < len(self.label_names):
                        skill_name = self.label_names[i]
                    else:
                        skill_name = f"Unknown_Skill_{i}"
                predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions