Spaces:
Sleeping
Sleeping
| """ | |
| System tests for end-to-end workflows. | |
| Tests the complete system including training and inference pipelines. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| import tempfile | |
| from pathlib import Path | |
| from unittest.mock import patch, MagicMock | |
| import joblib | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.multioutput import MultiOutputClassifier | |
class TestTrainingPipeline:
    """System tests for the model training pipeline.

    Each test receives the ``sample_dataframe`` pytest fixture (defined
    outside this file) and turns it into features and labels with the
    project's ``extract_tfidf_features`` / ``prepare_labels`` helpers.
    """

    def test_complete_training_workflow(self, sample_dataframe):
        """Run the full workflow: features -> split -> fit -> predict."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import train_test_split

        # Extract features (the fitted vectorizer is not needed in this test)
        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels.values, test_size=0.2, random_state=42
        )

        # Train model: one random forest per label column
        rf = RandomForestClassifier(n_estimators=10, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Verify: one row per test sample, one column per label, binary output
        assert predictions.shape[0] == X_test.shape[0]
        assert predictions.shape[1] == y_test.shape[1]
        assert np.all((predictions == 0) | (predictions == 1))

    def test_training_with_oversampling(self, sample_dataframe):
        """Train a single-label classifier on randomly oversampled data."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from imblearn.over_sampling import RandomOverSampler
        from sklearn.model_selection import train_test_split

        # Prepare data
        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        # Use only the first label column: the oversampler is fed a 1-D target
        y_single = labels.iloc[:, 0].values

        # Split (the held-out targets are not used by the assertions below)
        X_train, X_test, y_train, _ = train_test_split(
            features, y_single, test_size=0.2, random_state=42
        )

        # Oversample the training portion only
        ros = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

        # Train
        rf = RandomForestClassifier(n_estimators=10, random_state=42)
        rf.fit(X_resampled, y_resampled)

        # Predict
        predictions = rf.predict(X_test)

        # Verify. ``shape[0]`` rather than ``len()`` keeps this consistent
        # with test_complete_training_workflow and also works if the feature
        # matrix is sparse (sparse matrices do not support ``len()``).
        assert len(predictions) == X_test.shape[0]
        assert np.all((predictions == 0) | (predictions == 1))

    def test_model_serialization(self, sample_dataframe):
        """Check that a model dumped and reloaded with joblib predicts identically."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        # Train simple model
        features, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(features, labels.values)

        # Save and load inside a temporary directory: the directory and the
        # pickle in it are removed on exit, even when an assertion fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = Path(tmp_dir) / "model.pkl"
            joblib.dump(model, model_path)
            loaded_model = joblib.load(model_path)

            # Verify predictions match
            pred_original = model.predict(features)
            pred_loaded = loaded_model.predict(features)
            np.testing.assert_array_equal(pred_original, pred_loaded)
class TestInferencePipeline:
    """System tests for the inference pipeline.

    All tests train the same small model on the ``sample_dataframe``
    fixture (defined outside this file) and then exercise prediction.
    """

    @staticmethod
    def _fit_model(sample_dataframe):
        """Fit a small multi-output random forest on the fixture data.

        Args:
            sample_dataframe: Input accepted by the project's
                ``extract_tfidf_features`` and ``prepare_labels`` helpers.

        Returns:
            A ``(model, vectorizer, features, labels)`` tuple: the fitted
            classifier, the vectorizer and feature matrix returned by
            ``extract_tfidf_features`` and the labels returned by
            ``prepare_labels``.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        features, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(features, labels.values)
        return model, vectorizer, features, labels

    def test_inference_on_new_text(self, sample_dataframe):
        """Predict labels for texts that were not part of the training data."""
        from hopcroft_skill_classification_tool_competition.features import (
            clean_github_text,
        )

        # Train model
        model, vectorizer, _, labels = self._fit_model(sample_dataframe)

        # New text
        new_texts = [
            "Fixed critical bug in authentication module",
            "Added new REST API endpoint for users",
        ]

        # Process new text with the vectorizer fitted during training
        cleaned_texts = [clean_github_text(text) for text in new_texts]
        new_features = vectorizer.transform(cleaned_texts).toarray()

        # Predict
        predictions = model.predict(new_features)

        # Verify: one row per text, one column per label, binary output
        assert predictions.shape[0] == len(new_texts)
        assert predictions.shape[1] == labels.shape[1]
        assert np.all((predictions == 0) | (predictions == 1))

    def test_inference_with_empty_input(self, sample_dataframe):
        """Check that an empty string goes through cleaning, vectorizing and predict."""
        from hopcroft_skill_classification_tool_competition.features import (
            clean_github_text,
        )

        # Train model
        model, vectorizer, _, labels = self._fit_model(sample_dataframe)

        # Empty text
        cleaned = clean_github_text("")
        new_features = vectorizer.transform([cleaned]).toarray()

        # Should not crash
        predictions = model.predict(new_features)

        assert predictions.shape[0] == 1
        assert predictions.shape[1] == labels.shape[1]
        # Same binary-output check the sibling tests apply
        assert np.all((predictions == 0) | (predictions == 1))

    def test_batch_inference(self, sample_dataframe):
        """Predict for the whole training batch in a single call."""
        # Train model
        model, _, features, labels = self._fit_model(sample_dataframe)

        # Batch prediction
        predictions = model.predict(features)

        assert predictions.shape == labels.shape
        assert np.all((predictions == 0) | (predictions == 1))
class TestEndToEndDataFlow:
    """System tests for the complete data flow from raw data to predictions."""

    def test_full_pipeline_database_to_predictions(self, temp_db):
        """Load data from the database, train a model and sanity-check accuracy.

        Args:
            temp_db: Pytest fixture (defined outside this file) that is
                passed straight to ``load_data_from_db``.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            load_data_from_db,
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import train_test_split

        # Load data
        df = load_data_from_db(temp_db)

        # Extract features (the fitted vectorizer is not needed here)
        features, _ = extract_tfidf_features(df, max_features=50)
        labels = prepare_labels(df)

        # Split
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels.values, test_size=0.4, random_state=42
        )

        # Train
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Evaluate (simple check): accuracy of each label column on its own
        accuracies = [
            accuracy_score(y_test[:, i], predictions[:, i])
            for i in range(y_test.shape[1])
        ]

        # The accuracy averaged over all labels must clear a very lenient
        # threshold; the test data is small, so this is only a smoke check.
        mean_accuracy = np.mean(accuracies)
        assert mean_accuracy > 0.4, f"mean per-label accuracy too low: {mean_accuracy:.3f}"
class TestModelValidation:
    """System tests covering the model-validation workflows."""

    def test_cross_validation_workflow(self, sample_dataframe):
        """Run 2-fold cross-validation on the first label column."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import cross_val_score

        # Build the feature matrix and pick a single target column
        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_table = prepare_labels(sample_dataframe)
        first_label = label_table.iloc[:, 0].values

        # The run only has to complete; scores may be poor on such small data
        fold_scores = cross_val_score(
            RandomForestClassifier(n_estimators=5, random_state=42),
            tfidf_matrix,
            first_label,
            cv=2,
            scoring="accuracy",
        )

        # One score per fold, each a valid accuracy
        assert len(fold_scores) == 2
        for fold_score in fold_scores:
            assert 0 <= fold_score <= 1

    def test_grid_search_workflow(self, sample_dataframe):
        """Run a 2x2 hyper-parameter grid search on the first label column."""
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )
        from sklearn.model_selection import GridSearchCV

        # Build the feature matrix and pick a single target column
        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_table = prepare_labels(sample_dataframe)
        first_label = label_table.iloc[:, 0].values

        # Deliberately tiny grid so the search stays fast
        search = GridSearchCV(
            RandomForestClassifier(random_state=42),
            {"n_estimators": [5, 10], "max_depth": [5, 10]},
            cv=2,
            scoring="accuracy",
        )
        search.fit(tfidf_matrix, first_label)

        # The fitted search must expose its best configuration and score
        assert hasattr(search, "best_params_")
        assert hasattr(search, "best_score_")
        assert search.best_score_ >= 0
class TestRegressionScenarios:
    """Regression tests for known issues and edge cases."""

    def test_empty_feature_vectors_handling(self):
        """
        Regression test: Ensure empty feature vectors don't crash training.

        This was identified in Great Expectations TEST 2 - 25 samples with
        zero features after TF-IDF extraction.
        """
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.multioutput import MultiOutputClassifier

        # Create data with some zero vectors
        X = np.array([
            [0.1, 0.2, 0.3],
            [0.0, 0.0, 0.0],  # Empty vector
            [0.4, 0.5, 0.6],
            [0.0, 0.0, 0.0],  # Another empty vector
        ])
        y = np.array([
            [1, 0],
            [0, 1],
            [1, 1],
            [0, 0],
        ])

        # Should not crash
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        model = MultiOutputClassifier(rf)
        model.fit(X, y)
        predictions = model.predict(X)

        assert predictions.shape == y.shape

    def test_zero_occurrence_labels_handling(self):
        """
        Regression test: Handle labels with zero occurrences.

        This was identified in Great Expectations TEST 5 - 75 labels with
        zero occurrences in the dataset.
        """
        from hopcroft_skill_classification_tool_competition.features import get_label_columns

        # Create dataframe with some zero-occurrence labels
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'Label1': [1, 1, 0],  # Has occurrences
            'Label2': [0, 0, 0],  # Zero occurrences
            'Label3': [1, 0, 1],  # Has occurrences
        })

        label_cols = get_label_columns(df)

        # Should include all labels, including the one that never occurs
        assert 'Label1' in label_cols
        assert 'Label2' in label_cols
        assert 'Label3' in label_cols
        # Training code should filter these out before stratification
        # This test just verifies detection works

    def test_high_sparsity_features(self):
        """
        Regression test: Handle very sparse features (>99% zeros).

        This was identified in Great Expectations TEST 6 - 99.88% sparsity.
        """
        from sklearn.ensemble import RandomForestClassifier

        # Seeded generator so the test data is identical on every run
        rng = np.random.default_rng(42)
        n_samples, n_features = 100, 1000

        # Create highly sparse feature matrix: exactly one randomly placed
        # entry per row is written, i.e. at most 0.1% non-zero values
        X = np.zeros((n_samples, n_features))
        columns = rng.integers(0, n_features, size=n_samples)
        X[np.arange(n_samples), columns] = rng.random(n_samples)
        y = rng.integers(0, 2, size=n_samples)

        # Should handle high sparsity without crashing
        rf = RandomForestClassifier(n_estimators=5, random_state=42)
        rf.fit(X, y)
        predictions = rf.predict(X)

        assert len(predictions) == len(y)

    def test_duplicate_samples_detection(self):
        """
        Regression test: Detect duplicate samples.

        This was identified in Deepchecks validation - 481 duplicates (6.72%).
        """
        df = pd.DataFrame({
            'issue text': ['duplicate', 'duplicate', 'unique'],
            'issue description': ['desc', 'desc', 'different'],
            'Label1': [1, 1, 0],
        })

        # Check for duplicates on the text columns only (labels are ignored)
        duplicates = df[['issue text', 'issue description']].duplicated()
        assert duplicates.sum() == 1  # One duplicate found

        # Removal should be done in data cleaning pipeline
        df_cleaned = df.drop_duplicates(subset=['issue text', 'issue description'])
        assert len(df_cleaned) == 2
class TestAcceptanceCriteria:
    """Acceptance tests that check the stated requirements are satisfied."""

    def test_multi_label_classification_support(self, sample_dataframe):
        """
        Acceptance test: more than one label can be predicted for a sample.

        Requirement: Each issue can have multiple skill labels.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_table = prepare_labels(sample_dataframe)

        # One random forest per label column
        classifier = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        classifier.fit(tfidf_matrix, label_table.values)

        # Count the positive labels predicted for each training sample
        positives_per_row = classifier.predict(tfidf_matrix).sum(axis=1)

        # At least one sample must receive two or more labels
        assert np.any(positives_per_row > 1), "System should support multiple labels per sample"

    def test_handles_github_text_format(self):
        """
        Acceptance test: GitHub-flavoured issue text is cleaned.

        Requirement: Process text from GitHub issues with URLs, code, etc.
        """
        from hopcroft_skill_classification_tool_competition.features import clean_github_text

        github_text = """
        Fixed bug in authentication #123
        See: https://github.com/repo/issues/123
        ```python
        def login(user):
            return authenticate(user)
        ```
        Related to <b>security</b> improvements 🔒
        """

        cleaned = clean_github_text(github_text)

        # URL scheme, code fences and HTML tags must all be gone ...
        for noise in ("https://", "```", "<b>"):
            assert noise not in cleaned
        # ... while some content has to survive the cleaning
        assert len(cleaned) > 0

    def test_produces_binary_predictions(self, sample_dataframe):
        """
        Acceptance test: every prediction is either 0 or 1.

        Requirement: Clear yes/no predictions for each skill.
        """
        from hopcroft_skill_classification_tool_competition.features import (
            extract_tfidf_features,
            prepare_labels,
        )

        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_table = prepare_labels(sample_dataframe)

        classifier = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=5, random_state=42)
        )
        classifier.fit(tfidf_matrix, label_table.values)
        predicted = classifier.predict(tfidf_matrix)

        # No value other than 0 or 1 may appear anywhere in the output
        assert np.all((predicted == 0) | (predicted == 1))