# fix integration tests (commit ea5abff)
"""
Integration tests for the feature extraction pipeline.
Tests the combined functionality of dataset loading, text processing,
and feature extraction working together.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path
from hopcroft_skill_classification_tool_competition.features import (
load_data_from_db,
create_feature_dataset,
extract_tfidf_features,
prepare_labels,
get_text_columns,
get_label_columns,
)
@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        # Run both halves of the pipeline on the same frame.
        matrix, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        targets = prepare_labels(sample_dataframe)

        # Row counts must line up across features, labels, and input.
        assert matrix.shape[0] == len(targets)
        assert matrix.shape[0] == len(sample_dataframe)

        # Output container types.
        assert isinstance(matrix, np.ndarray)
        assert isinstance(targets, pd.DataFrame)

        # Neither side may carry NaN or infinite values.
        assert np.isnan(matrix).sum() == 0
        assert np.isinf(matrix).sum() == 0
        assert targets.notna().all().all()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        frame = load_data_from_db(temp_db)
        matrix, vectorizer = extract_tfidf_features(frame, max_features=50)
        targets = prepare_labels(frame)

        # The whole chain must preserve the row count end to end.
        n_rows = len(frame)
        assert matrix.shape[0] == n_rows
        assert targets.shape[0] == n_rows
        assert matrix.shape[0] == targets.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        matrix, targets, feat_names, lbl_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False
        )

        # Output container types (sklearn hands back an ndarray of names).
        assert isinstance(matrix, np.ndarray)
        assert isinstance(targets, pd.DataFrame)
        assert isinstance(feat_names, np.ndarray)
        assert isinstance(lbl_names, list)

        # Dimensions must be mutually consistent.
        assert matrix.shape[0] == targets.shape[0]
        assert matrix.shape[1] == len(feat_names)
        assert targets.shape[1] == len(lbl_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        expected = len(sample_dataframe)
        matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        targets = prepare_labels(sample_dataframe)
        assert matrix.shape[0] == expected
        assert targets.shape[0] == expected

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        matrix, _ = extract_tfidf_features(frame, max_features=50)
        targets = prepare_labels(frame)

        # Every document survives processing.
        assert matrix.shape[0] == 3
        assert targets.shape[0] == 3
        # At least one non-zero feature value overall.
        assert not np.all(matrix == 0)
@pytest.mark.integration
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        # Variant with noisy text (URLs, HTML, code fences, emoji).
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        # min_df=1 / max_df=1.0 keeps this tiny corpus from producing
        # an empty vocabulary.
        noisy_matrix, _ = extract_tfidf_features(noisy, max_features=50, min_df=1, max_df=1.0)

        # Variant with already-clean text.
        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_matrix, _ = extract_tfidf_features(tidy, max_features=50, min_df=1, max_df=1.0)

        # Cleaning runs on both inputs, so the shapes should agree even
        # if individual values differ (e.g. through stemming).
        assert noisy_matrix.shape == tidy_matrix.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })
        binarized = prepare_labels(frame)

        # Only the two binary values may appear anywhere.
        assert binarized.isin([0, 1]).all().all()
        # Any positive count collapses to 1; zero stays 0.
        assert binarized['Label1'].tolist() == [0, 1, 1]
        assert binarized['Label2'].tolist() == [1, 0, 1]

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        targets = prepare_labels(sample_dataframe)

        # Row by row: every sample must have both features and labels.
        for row in range(len(sample_dataframe)):
            assert matrix[row].shape[0] > 0
            assert targets.iloc[row].shape[0] > 0
@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples."""
        total = 1000
        frame = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(total)],
            'issue description': [f'Description for issue {i}' for i in range(total)],
            'Label1': np.random.randint(0, 2, total),
            'Label2': np.random.randint(0, 2, total),
            'Label3': np.random.randint(0, 2, total),
        })

        matrix, _ = extract_tfidf_features(frame, max_features=500)
        targets = prepare_labels(frame)

        assert matrix.shape[0] == total
        assert targets.shape[0] == total
        # The vocabulary cap bounds the feature dimension.
        assert matrix.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        total_labels = 50
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })
        # Attach one random binary column per label.
        for idx in range(total_labels):
            frame[f'Label_{idx}'] = np.random.randint(0, 2, 3)

        targets = prepare_labels(frame)
        assert targets.shape[1] == total_labels
        assert targets.isin([0, 1]).all().all()
@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        The test persists the pipeline output into its own temporary
        directory and verifies an exact round trip. It deliberately
        passes ``save_processed=False``: the previous version opened a
        MonkeyPatch context that patched nothing while calling
        ``save_processed=True``, which wrote to the real
        PROCESSED_DATA_DIR outside the temp dir.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Build features/labels without triggering the library's own
            # save path -- we persist them manually into the temp dir.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False
            )

            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            # Load back and require an exact byte-for-byte round trip.
            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")

            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)
@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection should simply find nothing.
        assert len(get_text_columns(frame)) == 0

        # Feature extraction cannot proceed without text columns and
        # should fail loudly rather than produce meaningless output.
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values raises appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so it should raise a ValueError with a descriptive message.
        """
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })
        # An all-NaN corpus yields no tokens at all.
        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(frame, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })
        # With no label columns there is nothing to binarize.
        assert len(get_label_columns(frame)) == 0