File size: 7,867 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np

from hopcroft_skill_classification_tool_competition.config import (
    API_CONFIG,
    DATA_PATHS,
    EMBEDDING_MODEL_NAME,
    MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text


class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-names file is missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # Candidate locations for kept_label_indices.npy; checked in priority
        # order (models dir, then embedding dir, then tfidf dir) in _load_artifacts.
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        self.kept_indices = None

        self._load_artifacts()

    def _load_artifacts(self) -> None:
        """Load the classifier, the vectorizer/encoder, label names, and kept indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first existing candidate location.
        # Order matters: the models dir takes precedence over the feature dirs.
        candidate_paths = (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        )
        for path in candidate_paths:
            if path.exists():
                print(f"Loading kept indices from {path}")
                self.kept_indices = np.load(path)
                break
        else:
            # No file anywhere: every model output position maps 1:1 to label_names.
            print("No kept_label_indices.npy found. Assuming all labels are used.")
            self.kept_indices = None

    def _load_tfidf_vectorizer(self) -> None:
        """Load the fitted TF-IDF vectorizer from the models directory."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self) -> None:
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        # Imported lazily so TF-IDF-only deployments don't need the dependency.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e

        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> Any:
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            A 1-row feature matrix ready for model prediction: a scipy sparse
            matrix for "tfidf", or a numpy array for "embedding".
        """
        if self.feature_type == "tfidf":
            # TF-IDF: stem tokens to match training-time preprocessing.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        else:
            # Embedding: no stemming (LLMs need full words)
            cleaned = clean_github_text(text, use_stemming=False)
            return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _positive_proba(self, i: int, prob: np.ndarray) -> float:
        """Extract P(label=1) for output position i from one predict_proba array.

        Handles degenerate per-label estimators trained on a single class:
        if the only class seen was 1 the probability is 1.0, otherwise 0.0.
        """
        if prob.shape[1] >= 2:
            return prob[0, 1]
        try:
            classes = self.model.estimators_[i].classes_
            if len(classes) == 1 and classes[0] == 1:
                return 1.0
            return 0.0
        except Exception:
            # Best-effort fallback: treat an unreadable estimator as negative.
            return 0.0

    def _label_for_index(self, i: int) -> Optional[str]:
        """Map model output position i to a skill name, or None to skip it.

        With kept_indices, position i refers to label_names[kept_indices[i]];
        positions beyond kept_indices are skipped. Without kept_indices the
        mapping is identity, with a synthetic placeholder for out-of-range i.
        """
        if self.kept_indices is not None:
            if i >= len(self.kept_indices):
                return None
            return self.label_names[self.kept_indices[i]]
        if i < len(self.label_names):
            return self.label_names[i]
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence', sorted by
            confidence in descending order.
        """
        # Transform text to features
        features = self._transform_text(text)

        # MultiOutputClassifier.predict_proba returns one array per label,
        # each of shape (n_samples, n_classes) -> row [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)

        confidence_scores = np.array(
            [self._positive_proba(i, prob) for i, prob in enumerate(probas_list)]
        )

        # Filter by threshold and map output positions to label names
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._label_for_index(i)
            if skill_name is None:
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)

        return predictions