File size: 5,199 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
"""
Pytest configuration and fixtures for behavioral tests.
"""
import pytest
import numpy as np
import joblib
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from hopcroft_skill_classification_tool_competition.config import DATA_PATHS
from hopcroft_skill_classification_tool_competition.features import (
clean_github_text,
get_label_columns,
load_data_from_db
)
@pytest.fixture(scope="session")
def trained_model():
    """Load the trained classifier, preferring the SMOTE-balanced variant."""
    models_dir = Path(DATA_PATHS["models_dir"])
    # Try the SMOTE model first, then fall back to the plain grid-search model.
    candidates = [
        models_dir / "random_forest_tfidf_gridsearch_smote.pkl",
        models_dir / "random_forest_tfidf_gridsearch.pkl",
    ]
    for candidate in candidates:
        if candidate.exists():
            return joblib.load(candidate)
    # Neither artifact exists: skip (message references the last path tried).
    pytest.skip(f"Model not found at {candidates[-1]}. Please train a model first.")
@pytest.fixture(scope="session")
def tfidf_vectorizer(trained_model):
    """
    Rebuild the TF-IDF vectorizer that matches the trained model.

    The vectorizer was not persisted alongside the model, so we refit one on
    the training data with the same max_features=1000 used during training.
    In a production setting, save and load the vectorizer separately.
    """
    features_path = Path(DATA_PATHS["features"])
    if not features_path.exists():
        pytest.skip(f"Features not found at {features_path}. Please run feature extraction first.")
    # Imported lazily so collection does not pay for it when the fixture is unused.
    from hopcroft_skill_classification_tool_competition.features import extract_tfidf_features
    try:
        training_df = load_data_from_db()
        # max_features=1000 matches the feature dimensionality the model expects.
        _, fitted_vectorizer = extract_tfidf_features(training_df, max_features=1000)
        return fitted_vectorizer
    except Exception as e:
        pytest.skip(f"Could not load vectorizer: {e}")
@pytest.fixture(scope="session")
def label_names():
    """Return the ordered list of label column names found in the database."""
    try:
        frame = load_data_from_db()
        return get_label_columns(frame)
    except Exception as e:
        # Database unavailable in this environment — skip rather than error.
        pytest.skip(f"Could not load label names: {e}")
@pytest.fixture
def predict_text(trained_model, tfidf_vectorizer):
    """
    Factory fixture that returns a function to predict skills from raw text.

    Returns:
        Function that takes text and returns predicted label indices
        (or a probability matrix when return_proba=True).
    """
    def _predict(text: str, return_proba: bool = False):
        """
        Predict skills from raw text.

        Args:
            text: Raw GitHub issue text
            return_proba: If True, return probabilities instead of binary predictions

        Returns:
            If return_proba=False: indices of predicted labels (numpy array)
            If return_proba=True: probability matrix (n_samples, n_labels)
        """
        # Clean and transform text into the dense vector the model expects.
        cleaned = clean_github_text(text)
        features = tfidf_vectorizer.transform([cleaned]).toarray()
        if return_proba:
            # The multi-output RandomForest exposes one estimator per label;
            # each predict_proba returns (n_samples, 2) with the probability
            # of class 1 in column 1.
            try:
                # BUG FIX: the previous `[0][:, 1]` indexed the 1-D first row
                # (shape (2,)) with two indices, which always raised
                # IndexError and silently fell back to binary predictions.
                # `[:, 1]` correctly selects the class-1 column per sample.
                probas = np.array([
                    estimator.predict_proba(features)[:, 1]
                    for estimator in trained_model.estimators_
                ]).T  # -> (n_samples, n_labels)
                return probas
            except Exception:
                # Fallback to binary predictions if probabilities not available
                return trained_model.predict(features)
        # Get binary predictions for the single input sample.
        predictions = trained_model.predict(features)[0]
        # Return indices of positive labels
        return np.where(predictions == 1)[0]
    return _predict
@pytest.fixture
def predict_with_labels(predict_text, label_names):
    """
    Factory fixture that returns a function to predict skills with label names.

    Returns:
        Function that takes text and returns list of predicted label names
    """
    def _predict(text: str):
        """
        Predict skill labels from raw text.

        Args:
            text: Raw GitHub issue text

        Returns:
            List of predicted label names
        """
        # Map the predicted label indices straight to their names.
        return [label_names[idx] for idx in predict_text(text)]
    return _predict
def pytest_configure(config):
    """Register custom markers."""
    # Data-driven registration keeps `--strict-markers` runs from failing
    # and makes it trivial to add new marker categories.
    marker_lines = (
        "invariance: Tests for invariance (changes should not affect predictions)",
        "directional: Tests for directional expectations (changes should affect predictions predictably)",
        "mft: Minimum Functionality Tests (basic examples with expected outputs)",
        "training: Tests for model training validation (loss, overfitting, devices)",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
|