|
|
""" |
|
|
System tests for end-to-end workflows. |
|
|
|
|
|
Tests the complete system including training and inference pipelines. |
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
from unittest.mock import patch, MagicMock |
|
|
import joblib |
|
|
|
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.multioutput import MultiOutputClassifier |
|
|
|
|
|
|
|
|
@pytest.mark.system
@pytest.mark.slow
class TestTrainingPipeline:
    """System tests for model training pipeline."""

    def test_complete_training_workflow(self, sample_dataframe):
        """Test complete training workflow from data to model."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import train_test_split

        # Build the TF-IDF feature matrix and the multi-label target matrix.
        X, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        # Hold out 20% of the samples for evaluation.
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y.values, test_size=0.2, random_state=42
        )

        # One random forest per label via the multi-output wrapper.
        clf = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=10, random_state=42)
        )
        clf.fit(X_tr, y_tr)

        preds = clf.predict(X_te)

        # Predictions must cover every held-out sample and label, and be binary.
        assert preds.shape[0] == X_te.shape[0]
        assert preds.shape[1] == y_te.shape[1]
        assert np.all((preds == 0) | (preds == 1))

    def test_training_with_oversampling(self, sample_dataframe):
        """Test training pipeline with oversampling."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from imblearn.over_sampling import RandomOverSampler
        from sklearn.model_selection import train_test_split

        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        y_all = prepare_labels(sample_dataframe)

        # The oversampler works on a single target, so use the first label.
        y = y_all.iloc[:, 0].values

        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Rebalance only the training split before fitting.
        sampler = RandomOverSampler(random_state=42)
        X_bal, y_bal = sampler.fit_resample(X_tr, y_tr)

        clf = RandomForestClassifier(n_estimators=10, random_state=42)
        clf.fit(X_bal, y_bal)

        preds = clf.predict(X_te)

        # One binary prediction per held-out sample.
        assert len(preds) == len(X_te)
        assert np.all((preds == 0) | (preds == 1))

    def test_model_serialization(self, sample_dataframe):
        """Test model can be saved and loaded."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        clf = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        clf.fit(X, y.values)

        # Reserve a temp path that survives the context manager
        # (delete=False), then clean it up explicitly below.
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            model_path = f.name

        try:
            joblib.dump(clf, model_path)
            restored = joblib.load(model_path)

            # The round-tripped model must predict identically.
            pred_before = clf.predict(X)
            pred_after = restored.predict(X)
            np.testing.assert_array_equal(pred_before, pred_after)
        finally:
            Path(model_path).unlink()
|
|
|
|
|
|
|
|
@pytest.mark.system
class TestInferencePipeline:
    """System tests for inference pipeline."""

    def test_inference_on_new_text(self, sample_dataframe):
        """Test inference pipeline on new unseen text."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
            clean_github_text,
        )

        # Fit a small model on the sample fixture.
        X, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        clf = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        clf.fit(X, y.values)

        # Unseen issue texts go through the same cleaning + fitted vectorizer.
        new_texts = [
            "Fixed critical bug in authentication module",
            "Added new REST API endpoint for users",
        ]
        cleaned = [clean_github_text(t) for t in new_texts]
        X_new = vectorizer.transform(cleaned).toarray()

        preds = clf.predict(X_new)

        # One binary row per input text, one column per label.
        assert preds.shape[0] == len(new_texts)
        assert preds.shape[1] == y.shape[1]
        assert np.all((preds == 0) | (preds == 1))

    def test_inference_with_empty_input(self, sample_dataframe):
        """Test inference handles empty input gracefully."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
            clean_github_text,
        )

        X, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        clf = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        clf.fit(X, y.values)

        # An empty string must still yield exactly one feature row.
        empty_text = ""
        X_empty = vectorizer.transform([clean_github_text(empty_text)]).toarray()

        preds = clf.predict(X_empty)

        assert preds.shape[0] == 1
        assert preds.shape[1] == y.shape[1]

    def test_batch_inference(self, sample_dataframe):
        """Test inference on batch of samples."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        X, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        clf = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        clf.fit(X, y.values)

        # Predicting on the whole batch preserves the label matrix shape.
        preds = clf.predict(X)

        assert preds.shape == y.shape
        assert np.all((preds == 0) | (preds == 1))
|
|
|
|
|
|
|
|
@pytest.mark.system
@pytest.mark.requires_data
class TestEndToEndDataFlow:
    """System tests for complete data flow from raw to predictions."""

    def test_full_pipeline_database_to_predictions(self, temp_db):
        """Test complete pipeline from database to predictions.

        Exercises the full flow: load rows from the test database,
        extract TF-IDF features and labels, train a small multi-output
        forest, and check that mean per-label accuracy clears a loose
        sanity floor (not a quality bar).
        """
        from hopcroft_skill_classification_tool_competition.features import (
            load_data_from_db,
            extract_tfidf_features,
            prepare_labels,
        )
        # Grouped with the other imports instead of mid-function as before.
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import train_test_split

        df = load_data_from_db(temp_db)

        features, vectorizer = extract_tfidf_features(df, max_features=50)
        labels = prepare_labels(df)

        X_train, X_test, y_train, y_test = train_test_split(
            features, labels.values, test_size=0.4, random_state=42
        )

        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)

        # Per-label accuracy via a comprehension (was a manual append loop).
        accuracies = [
            accuracy_score(y_test[:, i], predictions[:, i])
            for i in range(y_test.shape[1])
        ]

        assert np.mean(accuracies) > 0.4
|
|
|
|
|
|
|
|
@pytest.mark.system
class TestModelValidation:
    """System tests for model validation workflows."""

    def test_cross_validation_workflow(self, sample_dataframe):
        """Test cross-validation workflow."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import cross_val_score

        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        # Cross-validation here targets a single label column.
        y = prepare_labels(sample_dataframe).iloc[:, 0].values

        estimator = RandomForestClassifier(n_estimators=5, random_state=42)
        scores = cross_val_score(estimator, X, y, cv=2, scoring='accuracy')

        # Two folds -> two scores, each a valid accuracy in [0, 1].
        assert len(scores) == 2
        assert all(0 <= s <= 1 for s in scores)

    def test_grid_search_workflow(self, sample_dataframe):
        """Test grid search workflow."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import GridSearchCV

        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe).iloc[:, 0].values

        # A tiny grid keeps the search fast while still exercising the API.
        param_grid = {
            'n_estimators': [5, 10],
            'max_depth': [5, 10],
        }

        search = GridSearchCV(
            RandomForestClassifier(random_state=42),
            param_grid,
            cv=2,
            scoring='accuracy',
        )
        search.fit(X, y)

        # The fitted search must expose its best configuration and score.
        assert hasattr(search, 'best_params_')
        assert hasattr(search, 'best_score_')
        assert search.best_score_ >= 0
|
|
|
|
|
|
|
|
@pytest.mark.system
@pytest.mark.regression
class TestRegressionScenarios:
    """Regression tests for known issues and edge cases."""

    def test_empty_feature_vectors_handling(self):
        """
        Regression test: Ensure empty feature vectors don't crash training.

        This was identified in Great Expectations TEST 2 - 25 samples with
        zero features after TF-IDF extraction.
        """
        # All-zero rows mimic samples whose text produced no TF-IDF terms.
        # The classifiers come from the module-level imports; the previous
        # local re-imports were redundant.
        X = np.array([
            [0.1, 0.2, 0.3],
            [0.0, 0.0, 0.0],
            [0.4, 0.5, 0.6],
            [0.0, 0.0, 0.0],
        ])

        y = np.array([
            [1, 0],
            [0, 1],
            [1, 1],
            [0, 0],
        ])

        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(X, y)

        predictions = model.predict(X)
        assert predictions.shape == y.shape

    def test_zero_occurrence_labels_handling(self):
        """
        Regression test: Handle labels with zero occurrences.

        This was identified in Great Expectations TEST 5 - 75 labels with
        zero occurrences in the dataset.
        """
        from hopcroft_skill_classification_tool_competition.features import get_label_columns

        # 'Label2' never occurs; it must still be reported as a label column.
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'Label1': [1, 1, 0],
            'Label2': [0, 0, 0],
            'Label3': [1, 0, 1],
        })

        label_cols = get_label_columns(df)

        assert 'Label1' in label_cols
        assert 'Label2' in label_cols
        assert 'Label3' in label_cols

    def test_high_sparsity_features(self):
        """
        Regression test: Handle very sparse features (>99% zeros).

        This was identified in Great Expectations TEST 6 - 99.88% sparsity.
        """
        # Seeded generator makes the fixture reproducible. The original
        # used the unseeded global numpy RNG, so a failure here could not
        # be replayed deterministically.
        rng = np.random.default_rng(42)

        # Exactly one non-zero entry per row -> 99.9% sparsity.
        X = np.zeros((100, 1000))
        for i in range(100):
            X[i, rng.integers(1000)] = rng.random()

        y = rng.integers(0, 2, size=100)

        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        rf.fit(X, y)

        predictions = rf.predict(X)
        assert len(predictions) == len(y)

    def test_duplicate_samples_detection(self):
        """
        Regression test: Detect duplicate samples.

        This was identified in Deepchecks validation - 481 duplicates (6.72%).
        """
        df = pd.DataFrame({
            'issue text': ['duplicate', 'duplicate', 'unique'],
            'issue description': ['desc', 'desc', 'different'],
            'Label1': [1, 1, 0],
        })

        # Exactly one row repeats the (text, description) pair.
        duplicates = df[['issue text', 'issue description']].duplicated()
        assert duplicates.sum() == 1

        # Dropping duplicates keeps the first occurrence of each pair.
        df_cleaned = df.drop_duplicates(subset=['issue text', 'issue description'])
        assert len(df_cleaned) == 2
|
|
|
|
|
|
|
|
@pytest.mark.system
@pytest.mark.acceptance
class TestAcceptanceCriteria:
    """Acceptance tests verifying requirements are met."""

    def test_multi_label_classification_support(self, sample_dataframe):
        """
        Acceptance test: System supports multi-label classification.

        Requirement: Each issue can have multiple skill labels.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        # Small forest wrapped for one-classifier-per-label prediction.
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(features, labels.values)

        # Predicting on the training data itself: this checks that multiple
        # labels CAN fire for one sample, not generalization quality.
        predictions = model.predict(features)

        # Row-wise sum counts how many labels were assigned to each sample.
        # NOTE(review): assumes the fixture contains at least one sample the
        # model memorizes with >1 positive label — confirm against fixture.
        labels_per_sample = predictions.sum(axis=1)
        assert np.any(labels_per_sample > 1), "System should support multiple labels per sample"

    def test_handles_github_text_format(self):
        """
        Acceptance test: System handles GitHub issue text format.

        Requirement: Process text from GitHub issues with URLs, code, etc.
        """
        from hopcroft_skill_classification_tool_competition.features import clean_github_text

        # Representative raw GitHub issue body: issue reference, URL, fenced
        # code block, HTML tag, and an emoji.
        github_text = """
        Fixed bug in authentication #123

        See: https://github.com/repo/issues/123

        ```python
        def login(user):
            return authenticate(user)
        ```

        Related to <b>security</b> improvements 🔒
        """

        cleaned = clean_github_text(github_text)

        # URLs, code fences, and HTML markup must be stripped, while some
        # meaningful text survives.
        assert "https://" not in cleaned
        assert "```" not in cleaned
        assert "<b>" not in cleaned
        assert len(cleaned) > 0

    def test_produces_binary_predictions(self, sample_dataframe):
        """
        Acceptance test: System produces binary predictions (0 or 1).

        Requirement: Clear yes/no predictions for each skill.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(features, labels.values)

        predictions = model.predict(features)

        # Every predicted value must be exactly 0 or 1 — no probabilities.
        assert np.all((predictions == 0) | (predictions == 1))
|
|
|