|
|
from pathlib import Path |
|
|
from typing import Any, Dict, List, Optional |
|
|
|
|
|
import joblib |
|
|
import numpy as np |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import ( |
|
|
API_CONFIG, |
|
|
DATA_PATHS, |
|
|
EMBEDDING_MODEL_NAME, |
|
|
MODELS_DIR, |
|
|
) |
|
|
from hopcroft_skill_classification_tool_competition.features import clean_github_text |
|
|
|
|
|
|
|
|
class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label names are missing.
            ValueError: If feature_type is not "tfidf" or "embedding".
        """
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # kept_label_indices.npy records which label columns survived filtering
        # during training. It may live next to the model or under one of the
        # feature directories; all candidate locations are checked in order.
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = (
            Path(DATA_PATHS["features"]).parent.parent / "tfidf" / "kept_label_indices.npy"
        )
        self.kept_indices_path_emb = (
            Path(DATA_PATHS["features"]).parent.parent / "embedding" / "kept_label_indices.npy"
        )

        # Populated by _load_artifacts().
        self.model = None          # fitted multi-label classifier (joblib)
        self.vectorizer = None     # TfidfVectorizer or SentenceTransformer
        self.label_names = None    # full list of skill names (joblib)
        self.kept_indices = None   # model column -> label_names index map, or None

        self._load_artifacts()

    def _load_artifacts(self):
        """Load model, feature transformer, label names, and kept-label indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Search known locations in priority order: models dir, then the
        # embedding and tfidf feature directories (same order as before).
        for path in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if path.exists():
                print(f"Loading kept indices from {path}")
                self.kept_indices = np.load(path)
                break
        else:
            print("No kept_label_indices.npy found. Assuming all labels are used.")
            self.kept_indices = None

    def _load_tfidf_vectorizer(self):
        """Load the TF-IDF vectorizer."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self):
        """Load the SentenceTransformer model for embeddings."""
        # Imported lazily so TF-IDF-only deployments don't need the package.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e

        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> np.ndarray:
        """
        Transform raw text to features based on feature_type.

        Args:
            text: Raw input text; it is cleaned internally via clean_github_text

        Returns:
            Feature matrix ready for model prediction (sparse matrix for
            "tfidf", dense ndarray for "embedding")
        """
        if self.feature_type == "tfidf":
            # TF-IDF models were trained on stemmed tokens.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        # Embedding models expect unstemmed text.
        cleaned = clean_github_text(text, use_stemming=False)
        return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _extract_confidence_scores(self, features) -> np.ndarray:
        """
        Return per-label positive-class probabilities for a single sample.

        Handles both predict_proba output shapes:
        - ndarray of shape (n_samples, n_labels) (OneVsRestClassifier style);
          the original code crashed on this form with an IndexError.
        - list of per-label (n_samples, n_classes) arrays
          (MultiOutputClassifier style).
        """
        probas = self.model.predict_proba(features)

        if isinstance(probas, np.ndarray):
            # Each column is already the positive-class probability.
            return np.asarray(probas[0], dtype=float)

        confidence_scores = []
        for i, prob in enumerate(probas):
            if prob.shape[1] >= 2:
                # Column 1 is P(label present).
                confidence_scores.append(prob[0][1])
            else:
                # Degenerate estimator that only saw one class in training:
                # probability is 1.0 if that class was the positive one.
                try:
                    estimator = self.model.estimators_[i]
                    classes = estimator.classes_
                    if len(classes) == 1 and classes[0] == 1:
                        confidence_scores.append(1.0)
                    else:
                        confidence_scores.append(0.0)
                except Exception:
                    # Best-effort fallback: treat as "label absent".
                    confidence_scores.append(0.0)
        return np.array(confidence_scores)

    def _skill_name_for_index(self, i: int) -> Optional[str]:
        """
        Map model output column i to a skill name.

        Returns None when the column has no corresponding kept label (caller
        should skip it). Without a kept-indices map, out-of-range columns get
        a synthetic placeholder name, matching the original behavior.
        """
        if self.kept_indices is not None:
            if i < len(self.kept_indices):
                return self.label_names[self.kept_indices[i]]
            return None
        if i < len(self.label_names):
            return self.label_names[i]
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence descending
        """
        features = self._transform_text(text)
        confidence_scores = self._extract_confidence_scores(features)

        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._skill_name_for_index(i)
            if skill_name is None:
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions
|
|
|