# NOTE(review): the three lines below were GitHub page-scrape residue
# (author / commit message / short hash) and were not valid Python.
# Preserved as a comment: DaCrow13 — "Deploy to HF Spaces (Clean)" — commit 225af6a
from pathlib import Path
from typing import Any, Dict, List, Optional
import joblib
import numpy as np
from hopcroft_skill_classification_tool_competition.config import (
API_CONFIG,
DATA_PATHS,
EMBEDDING_MODEL_NAME,
MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text
class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts from disk.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-names artifact is missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
            ImportError: If feature_type is "embedding" and sentence-transformers
                is not installed.
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]
        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"
        # Candidate locations for kept_label_indices.npy; the artifact may live
        # next to the model or under the tfidf/embedding feature directories.
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = (
            Path(DATA_PATHS["features"]).parent.parent / "tfidf" / "kept_label_indices.npy"
        )
        self.kept_indices_path_emb = (
            Path(DATA_PATHS["features"]).parent.parent / "embedding" / "kept_label_indices.npy"
        )
        self.model = None  # fitted classifier loaded via joblib
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None  # sequence of human-readable skill names
        self.kept_indices = None  # optional mapping: model output idx -> label idx
        self._load_artifacts()

    def _load_artifacts(self) -> None:
        """Load model, vectorizer/encoder, label names, and kept indices.

        Raises:
            FileNotFoundError: If the model or label-names file does not exist.
            ValueError: If self.feature_type is not a recognized value.
        """
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first existing candidate location.
        # Priority order: models dir, then embedding dir, then tfidf dir
        # (same precedence as the original if/elif chain).
        self.kept_indices = None
        for candidate in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if candidate.exists():
                print(f"Loading kept indices from {candidate}")
                self.kept_indices = np.load(candidate)
                break
        else:
            print("No kept_label_indices.npy found. Assuming all labels are used.")

    def _load_tfidf_vectorizer(self) -> None:
        """Load the fitted TF-IDF vectorizer from the models directory.

        Raises:
            FileNotFoundError: If tfidf_vectorizer.pkl has not been produced yet.
        """
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self) -> None:
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If the optional sentence-transformers dependency is missing.
        """
        # Imported lazily so TF-IDF deployments do not need the dependency.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e
        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str):
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            For "tfidf": a scipy sparse matrix of shape (1, n_features), as
            produced by TfidfVectorizer.transform.
            For "embedding": a dense np.ndarray of shape (1, embedding_dim).
            (The original annotation said np.ndarray, which was wrong for the
            TF-IDF branch; either form is accepted by sklearn estimators.)
        """
        if self.feature_type == "tfidf":
            # TF-IDF: stemming matches how the vectorizer vocabulary was built
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        # Embedding: no stemming (transformer models expect full words)
        cleaned = clean_github_text(text, use_stemming=False)
        return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence descending. Only skills with confidence >= threshold
            are included.
        """
        # Transform text to features
        features = self._transform_text(text)

        # MultiOutputClassifier.predict_proba returns a list of arrays, one per
        # label; each array is (n_samples, n_classes) with columns [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)

        # Extract the positive-class probability for each label.
        confidence_scores = []
        for i, prob in enumerate(probas_list):
            if prob.shape[1] >= 2:
                confidence_scores.append(prob[0][1])
            else:
                # Degenerate per-label estimator trained on a single class:
                # probability of the positive label is 1.0 iff that class is 1.
                try:
                    classes = self.model.estimators_[i].classes_
                    only_positive = len(classes) == 1 and classes[0] == 1
                    confidence_scores.append(1.0 if only_positive else 0.0)
                except (AttributeError, IndexError, TypeError):
                    # Estimator metadata unavailable (was a bare `except Exception`);
                    # conservatively report the label as absent.
                    confidence_scores.append(0.0)
        confidence_scores = np.array(confidence_scores)

        # Filter by threshold and map model output indices to label names.
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            if self.kept_indices is not None:
                # Rare-label filtering was applied at training time: map the
                # compact output index back to the original label index.
                if i >= len(self.kept_indices):
                    continue  # no corresponding original label; skip
                skill_name = self.label_names[self.kept_indices[i]]
            elif i < len(self.label_names):
                skill_name = self.label_names[i]
            else:
                skill_name = f"Unknown_Skill_{i}"
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions