Spaces:
Sleeping
Sleeping
from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np

from hopcroft_skill_classification_tool_competition.config import (
    API_CONFIG,
    DATA_PATHS,
    EMBEDDING_MODEL_NAME,
    MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text
class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-names files are missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # Paths for kept indices (may be in different locations depending on
        # which feature-extraction run produced them).
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        self.kept_indices = None
        self._load_artifacts()

    def _load_artifacts(self):
        """Load the model, vectorizer/encoder, label names, and kept indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first candidate location that exists,
        # checked in priority order: models dir, embedding dir, tfidf dir.
        self.kept_indices = None
        for candidate in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if candidate.exists():
                print(f"Loading kept indices from {candidate}")
                self.kept_indices = np.load(candidate)
                break
        else:
            print("No kept_label_indices.npy found. Assuming all labels are used.")

    def _load_tfidf_vectorizer(self):
        """Load the TF-IDF vectorizer saved by the feature-extraction step."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self):
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        try:
            # Imported lazily so TF-IDF deployments don't need the package.
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e
        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> Any:
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            Feature matrix ready for model prediction: a scipy sparse matrix
            for "tfidf", or a (1, dim) numpy array for "embedding".
        """
        if self.feature_type == "tfidf":
            # TF-IDF: use stemming, matching how the vectorizer was fitted.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        # Embedding: no stemming (LLMs need full words)
        cleaned = clean_github_text(text, use_stemming=False)
        return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _positive_class_probability(self, prob: np.ndarray, label_idx: int) -> float:
        """
        Extract P(label=1) from one output of MultiOutputClassifier.predict_proba.

        Args:
            prob: (n_samples, n_classes) probability array for one label
            label_idx: Index of this label's estimator in self.model.estimators_

        Returns:
            Probability of the positive class for the first (only) sample.
        """
        if prob.shape[1] >= 2:
            # Column 1 is the positive class (sklearn sorts classes_ ascending).
            return float(prob[0][1])
        # Only one class was seen during training for this label: the single
        # probability column is 1.0, so decide via the estimator's classes_.
        try:
            classes = self.model.estimators_[label_idx].classes_
            if len(classes) == 1 and classes[0] == 1:
                return 1.0
            return 0.0
        except Exception:
            # Best-effort fallback: treat an unreadable estimator as negative.
            return 0.0

    def _label_name_for(self, i: int) -> Optional[str]:
        """
        Map a model output index to a skill name, honoring kept_indices.

        Args:
            i: Column index in the model's output

        Returns:
            The skill name, or None when the index falls outside the kept
            labels and the prediction should be skipped.
        """
        if self.kept_indices is not None:
            if i < len(self.kept_indices):
                # kept_indices maps compact model columns back to the
                # original label space.
                return self.label_names[self.kept_indices[i]]
            return None
        if i < len(self.label_names):
            return self.label_names[i]
        # No kept-indices mapping and the model emits more columns than we
        # have names for; surface a placeholder rather than crash.
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence',
            sorted by confidence descending.
        """
        features = self._transform_text(text)

        # MultiOutputClassifier.predict_proba returns a list of arrays, one
        # per label; each array is (n_samples, 2) -> [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)
        confidence_scores = np.array(
            [self._positive_class_probability(prob, i) for i, prob in enumerate(probas_list)]
        )

        # Filter by threshold and map column indices to label names.
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._label_name_for(i)
            if skill_name is None:
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions