|
|
""" |
|
|
Integration tests for the feature extraction pipeline. |
|
|
|
|
|
Tests the combined functionality of dataset loading, text processing, |
|
|
and feature extraction working together. |
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import tempfile |
|
|
import sqlite3 |
|
|
from pathlib import Path |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.features import ( |
|
|
load_data_from_db, |
|
|
create_feature_dataset, |
|
|
extract_tfidf_features, |
|
|
prepare_labels, |
|
|
get_text_columns, |
|
|
get_label_columns, |
|
|
) |
|
|
|
|
|
|
|
|
@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        X, _vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        # Row counts must agree across features, labels, and the input frame.
        assert X.shape[0] == len(y)
        assert X.shape[0] == len(sample_dataframe)

        # Container types produced by the pipeline.
        assert isinstance(X, np.ndarray)
        assert isinstance(y, pd.DataFrame)

        # No invalid numeric values and no missing labels.
        assert not np.any(np.isnan(X))
        assert not np.any(np.isinf(X))
        assert not y.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        frame = load_data_from_db(temp_db)
        X, _vectorizer = extract_tfidf_features(frame, max_features=50)
        y = prepare_labels(frame)

        # Features and labels stay aligned with the loaded frame.
        assert X.shape[0] == len(frame)
        assert y.shape[0] == len(frame)
        assert X.shape[0] == y.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        X, y, feat_names, lab_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False
        )

        # Return types of the end-to-end builder.
        assert isinstance(X, np.ndarray)
        assert isinstance(y, pd.DataFrame)
        assert isinstance(feat_names, np.ndarray)
        assert isinstance(lab_names, list)

        # Shapes are internally consistent with the reported names.
        assert X.shape[0] == y.shape[0]
        assert X.shape[1] == len(feat_names)
        assert y.shape[1] == len(lab_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        n_before = len(sample_dataframe)

        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        assert X.shape[0] == n_before
        assert y.shape[0] == n_before

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        X, _ = extract_tfidf_features(frame, max_features=50)
        y = prepare_labels(frame)

        # One row per document regardless of text length.
        assert X.shape[0] == 3
        assert y.shape[0] == 3

        # At least one non-zero TF-IDF weight must be produced.
        assert not np.all(X == 0)
|
|
|
|
|
|
|
|
@pytest.mark.integration
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        # Variant with URLs, HTML, code fences, and emoji in the text column.
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        noisy_features, _ = extract_tfidf_features(noisy, max_features=50, min_df=1, max_df=1.0)

        # Same rows with the noise already stripped out.
        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_features, _ = extract_tfidf_features(tidy, max_features=50, min_df=1, max_df=1.0)

        # Both variants yield feature matrices of matching shape.
        assert noisy_features.shape == tidy_features.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],
            'Label2': [1, 0, 100],
        })

        y = prepare_labels(frame)

        # Every label value must have been collapsed to 0 or 1.
        assert set(y.values.flatten()).issubset({0, 1})

        # Positive counts map to 1; zeros stay 0.
        expected = {
            (0, 'Label1'): 0,
            (1, 'Label1'): 1,
            (2, 'Label1'): 1,
            (0, 'Label2'): 1,
            (1, 'Label2'): 0,
            (2, 'Label2'): 1,
        }
        for (row, col), value in expected.items():
            assert y.loc[row, col] == value

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        X, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        y = prepare_labels(sample_dataframe)

        # Every row has a non-empty feature vector and label vector.
        for row in range(len(sample_dataframe)):
            assert X[row].shape[0] > 0
            assert y.iloc[row].shape[0] > 0
|
|
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples.

        Uses a seeded generator so the label data — and therefore the
        test — is deterministic across runs.
        """
        rng = np.random.default_rng(42)
        n_samples = 1000
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        # max_features is an upper bound on vocabulary size, not an exact width.
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        rng = np.random.default_rng(42)  # seeded for reproducibility
        n_labels = 50
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })

        # Attach 50 binary label columns.
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        assert labels.shape[1] == n_labels
        assert set(labels.values.flatten()).issubset({0, 1})
|
|
|
|
|
|
|
|
@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        Round-trips the feature matrix and label matrix through ``.npy``
        files in a temporary directory and verifies nothing is altered.

        Note: the original version wrapped this in an unused
        ``pytest.MonkeyPatch.context()`` and re-imported
        ``create_feature_dataset`` locally; both were dead code and have
        been removed.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Build features/labels from the fixture database.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=True
            )

            # Persist both arrays to disk.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            # Load them back.
            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")

            # The round trip must be lossless.
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)
|
|
|
|
|
|
|
|
@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # No recognized text columns should be detected...
        assert len(get_text_columns(frame)) == 0

        # ...and feature extraction should refuse to proceed.
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values raises appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so it should raise a ValueError with a descriptive message.
        """
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(frame, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
        })

        # With no label columns, detection should come back empty.
        assert len(get_label_columns(frame)) == 0
|
|
|