File size: 7,867 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np

from hopcroft_skill_classification_tool_competition.config import (
    API_CONFIG,
    DATA_PATHS,
    EMBEDDING_MODEL_NAME,
    MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text


class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-names file is missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # Candidate locations for kept_label_indices.npy; checked in priority
        # order (models dir, then embedding dir, then tfidf dir) in _load_artifacts.
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        self.kept_indices = None

        self._load_artifacts()

    def _load_artifacts(self) -> None:
        """Load the classifier, the vectorizer/encoder, label names, and kept indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first existing candidate location.
        # Order matters: the models dir takes precedence over the feature dirs.
        candidate_paths = (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        )
        for path in candidate_paths:
            if path.exists():
                print(f"Loading kept indices from {path}")
                self.kept_indices = np.load(path)
                break
        else:
            # No file anywhere: every model output position maps 1:1 to label_names.
            print("No kept_label_indices.npy found. Assuming all labels are used.")
            self.kept_indices = None

    def _load_tfidf_vectorizer(self) -> None:
        """Load the fitted TF-IDF vectorizer from the models directory."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self) -> None:
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        # Imported lazily so TF-IDF-only deployments don't need the dependency.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e

        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> Any:
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            A 1-row feature matrix ready for model prediction: a scipy sparse
            matrix for "tfidf", or a numpy array for "embedding".
        """
        if self.feature_type == "tfidf":
            # TF-IDF: stem tokens to match training-time preprocessing.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        else:
            # Embedding: no stemming (LLMs need full words)
            cleaned = clean_github_text(text, use_stemming=False)
            return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _positive_proba(self, i: int, prob: np.ndarray) -> float:
        """Extract P(label=1) for output position i from one predict_proba array.

        Handles degenerate per-label estimators trained on a single class:
        if the only class seen was 1 the probability is 1.0, otherwise 0.0.
        """
        if prob.shape[1] >= 2:
            return prob[0, 1]
        try:
            classes = self.model.estimators_[i].classes_
            if len(classes) == 1 and classes[0] == 1:
                return 1.0
            return 0.0
        except Exception:
            # Best-effort fallback: treat an unreadable estimator as negative.
            return 0.0

    def _label_for_index(self, i: int) -> Optional[str]:
        """Map model output position i to a skill name, or None to skip it.

        With kept_indices, position i refers to label_names[kept_indices[i]];
        positions beyond kept_indices are skipped. Without kept_indices the
        mapping is identity, with a synthetic placeholder for out-of-range i.
        """
        if self.kept_indices is not None:
            if i >= len(self.kept_indices):
                return None
            return self.label_names[self.kept_indices[i]]
        if i < len(self.label_names):
            return self.label_names[i]
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence in descending order.
        """
        # Transform text to features
        features = self._transform_text(text)

        # MultiOutputClassifier.predict_proba returns one array per label,
        # each of shape (n_samples, n_classes) -> row [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)

        confidence_scores = np.array(
            [self._positive_proba(i, prob) for i, prob in enumerate(probas_list)]
        )

        # Filter by threshold and map output positions to label names
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._label_for_index(i)
            if skill_name is None:
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)

        return predictions