"""Skill prediction service: loads trained model artifacts (TF-IDF or embedding
based) and predicts skill labels for free-form GitHub issue text."""
from pathlib import Path
from typing import Any, Dict, List, Optional
import joblib
import numpy as np
from hopcroft_skill_classification_tool_competition.config import (
API_CONFIG,
DATA_PATHS,
EMBEDDING_MODEL_NAME,
MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text
class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and load all required artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-name artifact is missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Fall back to config defaults when arguments are not supplied.
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]
        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"
        # kept_label_indices.npy may live in several locations depending on which
        # pipeline produced it; candidate paths are probed in order at load time.
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"
        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        # Maps model output column -> original label index; None means identity.
        self.kept_indices = None
        self._load_artifacts()

    def _load_artifacts(self):
        """Load model, vectorizer/encoder, label names, and kept indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)
        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )
        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)
        self._load_kept_indices()

    def _load_kept_indices(self):
        """Load kept_label_indices.npy from the first candidate path that exists.

        Probe order matches the historical priority: models dir, then the
        embedding features dir, then the tfidf features dir. When none exists,
        kept_indices stays None and all labels are assumed to be used.
        """
        for path in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if path.exists():
                print(f"Loading kept indices from {path}")
                self.kept_indices = np.load(path)
                return
        print("No kept_label_indices.npy found. Assuming all labels are used.")
        self.kept_indices = None

    def _load_tfidf_vectorizer(self):
        """Load the TF-IDF vectorizer."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self):
        """Load the SentenceTransformer model for embeddings."""
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e
        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str):
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaned internally)

        Returns:
            Feature matrix of shape (1, n_features) ready for model prediction.
            NOTE: the tfidf path returns a scipy sparse matrix (not converted to
            a dense array); the embedding path returns a dense numpy array.
        """
        if self.feature_type == "tfidf":
            # TF-IDF: use stemming to match how the vectorizer was fitted.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        # Embedding: no stemming (LLMs need full words)
        cleaned = clean_github_text(text, use_stemming=False)
        return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _positive_probabilities(self, features) -> np.ndarray:
        """
        Return P(label=1) for every output label on a single-sample feature matrix.

        MultiOutputClassifier.predict_proba returns a list of (n_samples, n_classes)
        arrays, one per label; column 1 is the positive class when both classes
        were seen in training. A label whose estimator saw only one class gets a
        constant 1.0 (only class 1 seen) or 0.0 (only class 0 seen).
        """
        probas_list = self.model.predict_proba(features)
        scores = []
        for i, prob in enumerate(probas_list):
            if prob.shape[1] >= 2:
                scores.append(prob[0][1])
            else:
                # Degenerate estimator: only one class present during training.
                try:
                    classes = self.model.estimators_[i].classes_
                    scores.append(1.0 if len(classes) == 1 and classes[0] == 1 else 0.0)
                except (AttributeError, IndexError):
                    # Model does not expose per-label estimators; be conservative.
                    scores.append(0.0)
        return np.array(scores)

    def _skill_name_for(self, i: int) -> Optional[str]:
        """
        Map model output column i to a human-readable skill name.

        Returns None when i falls outside the kept-indices mapping (such a
        prediction cannot be attributed to a known label and is dropped by
        the caller). Without a kept-indices mapping, out-of-range columns
        get a synthetic "Unknown_Skill_{i}" name.
        """
        if self.kept_indices is not None:
            if i < len(self.kept_indices):
                return self.label_names[self.kept_indices[i]]
            return None
        if i < len(self.label_names):
            return self.label_names[i]
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence descending.
        """
        features = self._transform_text(text)
        confidence_scores = self._positive_probabilities(features)
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._skill_name_for(i)
            if skill_name is None:
                # Column beyond the kept-indices mapping: cannot name it, drop it.
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})
        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions