Spaces:
Sleeping
Sleeping
from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np

from hopcroft_skill_classification_tool_competition.config import (
    API_CONFIG,
    DATA_PATHS,
    EMBEDDING_MODEL_NAME,
    MODELS_DIR,
)
from hopcroft_skill_classification_tool_competition.features import clean_github_text
class SkillPredictor:
    """
    Skill prediction class that supports both TF-IDF and Embedding-based models.

    The feature_type determines how text is transformed:
    - "tfidf": Uses saved TfidfVectorizer
    - "embedding": Uses SentenceTransformer to generate embeddings
    """

    def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
        """
        Initialize the SkillPredictor and eagerly load all artifacts.

        Args:
            model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
            feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]

        Raises:
            FileNotFoundError: If the model, vectorizer, or label-names files are missing.
            ValueError: If feature_type is neither "tfidf" nor "embedding".
        """
        # Use config defaults if not specified
        self.model_name = model_name or API_CONFIG["model_name"]
        self.feature_type = feature_type or API_CONFIG["feature_type"]

        self.model_path = MODELS_DIR / self.model_name
        self.labels_path = MODELS_DIR / "label_names.pkl"

        # Paths for kept indices (may be in different locations depending on
        # which feature-extraction run produced them).
        features_root = Path(DATA_PATHS["features"]).parent.parent
        self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
        self.kept_indices_path_tfidf = features_root / "tfidf" / "kept_label_indices.npy"
        self.kept_indices_path_emb = features_root / "embedding" / "kept_label_indices.npy"

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer or SentenceTransformer
        self.label_names = None
        self.kept_indices = None
        self._load_artifacts()

    def _load_artifacts(self):
        """Load the model, vectorizer/encoder, label names, and kept indices."""
        print(f"Loading model from {self.model_path}...")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model not found at {self.model_path}")
        self.model = joblib.load(self.model_path)

        # Load vectorizer/encoder based on feature type
        if self.feature_type == "tfidf":
            self._load_tfidf_vectorizer()
        elif self.feature_type == "embedding":
            self._load_embedding_model()
        else:
            raise ValueError(
                f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
            )

        # Load label names
        print(f"Loading label names from {self.labels_path}...")
        if not self.labels_path.exists():
            raise FileNotFoundError(f"Label names not found at {self.labels_path}")
        self.label_names = joblib.load(self.labels_path)

        # Load kept indices from the first candidate location that exists,
        # checked in priority order: models dir, embedding dir, tfidf dir.
        self.kept_indices = None
        for candidate in (
            self.kept_indices_path_models,
            self.kept_indices_path_emb,
            self.kept_indices_path_tfidf,
        ):
            if candidate.exists():
                print(f"Loading kept indices from {candidate}")
                self.kept_indices = np.load(candidate)
                break
        else:
            print("No kept_label_indices.npy found. Assuming all labels are used.")

    def _load_tfidf_vectorizer(self):
        """Load the TF-IDF vectorizer saved by the feature-extraction step."""
        vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
        print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
        if not vectorizer_path.exists():
            raise FileNotFoundError(
                f"TF-IDF vectorizer not found at {vectorizer_path}. "
                "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
            )
        self.vectorizer = joblib.load(vectorizer_path)

    def _load_embedding_model(self):
        """Load the SentenceTransformer model for embeddings.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        try:
            # Imported lazily so TF-IDF deployments don't need the package.
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                f"sentence-transformers is required for embedding-based models. "
                f"Install with: pip install sentence-transformers. Error: {e}"
            ) from e
        print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
        self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)

    def _transform_text(self, text: str) -> Any:
        """
        Transform text to features based on feature_type.

        Args:
            text: Raw input text (cleaning is applied here)

        Returns:
            Feature matrix ready for model prediction: a scipy sparse matrix
            for "tfidf", or a (1, dim) numpy array for "embedding".
        """
        if self.feature_type == "tfidf":
            # TF-IDF: use stemming, matching how the vectorizer was fitted.
            cleaned = clean_github_text(text, use_stemming=True)
            return self.vectorizer.transform([cleaned])
        # Embedding: no stemming (LLMs need full words)
        cleaned = clean_github_text(text, use_stemming=False)
        return self.vectorizer.encode([cleaned], convert_to_numpy=True)

    def _positive_class_probability(self, prob: np.ndarray, label_idx: int) -> float:
        """
        Extract P(label=1) from one output of MultiOutputClassifier.predict_proba.

        Args:
            prob: (n_samples, n_classes) probability array for one label
            label_idx: Index of this label's estimator in self.model.estimators_

        Returns:
            Probability of the positive class for the first (only) sample.
        """
        if prob.shape[1] >= 2:
            # Column 1 is the positive class (sklearn sorts classes_ ascending).
            return float(prob[0][1])
        # Only one class was seen during training for this label: the single
        # probability column is 1.0, so decide via the estimator's classes_.
        try:
            classes = self.model.estimators_[label_idx].classes_
            if len(classes) == 1 and classes[0] == 1:
                return 1.0
            return 0.0
        except Exception:
            # Best-effort fallback: treat an unreadable estimator as negative.
            return 0.0

    def _label_name_for(self, i: int) -> Optional[str]:
        """
        Map a model output index to a skill name, honoring kept_indices.

        Args:
            i: Column index in the model's output

        Returns:
            The skill name, or None when the index falls outside the kept
            labels and the prediction should be skipped.
        """
        if self.kept_indices is not None:
            if i < len(self.kept_indices):
                # kept_indices maps compact model columns back to the
                # original label space.
                return self.label_names[self.kept_indices[i]]
            return None
        if i < len(self.label_names):
            return self.label_names[i]
        # No kept-indices mapping and the model emits more columns than we
        # have names for; surface a placeholder rather than crash.
        return f"Unknown_Skill_{i}"

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Predict skills for a given text.

        Args:
            text: Input text (issue title + body)
            threshold: Confidence threshold for binary classification

        Returns:
            List of dicts with 'skill_name' and 'confidence',
            sorted by confidence descending.
        """
        features = self._transform_text(text)

        # MultiOutputClassifier.predict_proba returns a list of arrays, one
        # per label; each array is (n_samples, 2) -> [prob_0, prob_1].
        probas_list = self.model.predict_proba(features)
        confidence_scores = np.array(
            [self._positive_class_probability(prob, i) for i, prob in enumerate(probas_list)]
        )

        # Filter by threshold and map column indices to label names.
        predictions = []
        for i, score in enumerate(confidence_scores):
            if score < threshold:
                continue
            skill_name = self._label_name_for(i)
            if skill_name is None:
                continue
            predictions.append({"skill_name": skill_name, "confidence": float(score)})

        # Sort by confidence descending
        predictions.sort(key=lambda x: x["confidence"], reverse=True)
        return predictions