Spaces:
Sleeping
Sleeping
| """ | |
| Integration tests for the feature extraction pipeline. | |
| Tests the combined functionality of dataset loading, text processing, | |
| and feature extraction working together. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| import tempfile | |
| import sqlite3 | |
| from pathlib import Path | |
| from hopcroft_skill_classification_tool_competition.features import ( | |
| load_data_from_db, | |
| create_feature_dataset, | |
| extract_tfidf_features, | |
| prepare_labels, | |
| get_text_columns, | |
| get_label_columns, | |
| ) | |
class TestFeatureExtractionPipeline:
    """Integration tests for complete feature extraction pipeline."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """Test complete pipeline from DataFrame to features and labels."""
        feature_matrix, _vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        # Features and labels must stay row-aligned with the input samples.
        assert feature_matrix.shape[0] == len(label_frame)
        assert feature_matrix.shape[0] == len(sample_dataframe)

        # Expected container types.
        assert isinstance(feature_matrix, np.ndarray)
        assert isinstance(label_frame, pd.DataFrame)

        # Pipeline output must be clean: no NaN/Inf features, no null labels.
        assert np.isfinite(feature_matrix).all()
        assert not label_frame.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """Test pipeline from database loading to feature extraction."""
        frame = load_data_from_db(temp_db)
        feature_matrix, _vectorizer = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # Every stage must agree on the number of samples.
        sample_count = len(frame)
        assert feature_matrix.shape[0] == sample_count
        assert label_frame.shape[0] == sample_count
        assert feature_matrix.shape[0] == label_frame.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """Test the complete create_feature_dataset function."""
        features, labels, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False,
        )

        # Output container types.
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)  # sklearn returns ndarray
        assert isinstance(label_names, list)

        # Dimensions must be mutually consistent.
        assert features.shape[0] == labels.shape[0]
        assert features.shape[1] == len(feature_names)
        assert labels.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """Test that no samples are lost during pipeline."""
        expected_count = len(sample_dataframe)

        feature_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        assert feature_matrix.shape[0] == expected_count
        assert label_frame.shape[0] == expected_count

    def test_pipeline_with_various_text_lengths(self):
        """Test pipeline with documents of varying lengths."""
        frame = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,  # Very long
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        feature_matrix, _ = extract_tfidf_features(frame, max_features=50)
        label_frame = prepare_labels(frame)

        # Every document, regardless of length, should survive the pipeline.
        assert feature_matrix.shape[0] == 3
        assert label_frame.shape[0] == 3
        # At least some non-zero feature weights should be produced.
        assert not np.all(feature_matrix == 0)
class TestDataFlowConsistency:
    """Integration tests for data consistency through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Test that text cleaning impacts feature extraction."""
        # Variant with noisy text (URLs, HTML, code fences, emoji).
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        noisy_features, _ = extract_tfidf_features(noisy, max_features=50)

        # Variant with pre-cleaned text.
        tidy = sample_dataframe.copy()
        tidy['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        tidy_features, _ = extract_tfidf_features(tidy, max_features=50)

        # Cleaning is applied to both variants, so the matrices should at
        # least agree in shape; values may differ (e.g. due to stemming).
        assert noisy_features.shape == tidy_features.shape

    def test_label_binarization_consistency(self):
        """Test that label binarization is consistent."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],  # Different counts
            'Label2': [1, 0, 100],
        })
        labels = prepare_labels(frame)

        # Binarization must map every value into {0, 1}.
        assert set(labels.values.flatten()) <= {0, 1}

        # Spot-check cells: zero stays 0, any positive count becomes 1.
        expected_cells = {
            ('Label1', 0): 0, ('Label1', 1): 1, ('Label1', 2): 1,
            ('Label2', 0): 1, ('Label2', 1): 0, ('Label2', 2): 1,
        }
        for (column, row), expected in expected_cells.items():
            assert labels.loc[row, column] == expected

    def test_feature_label_alignment(self, sample_dataframe):
        """Test that features and labels remain aligned."""
        feature_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        # Row-by-row, each sample must carry both features and labels.
        for row in range(len(sample_dataframe)):
            assert feature_matrix[row].shape[0] > 0  # Has features
            assert label_frame.iloc[row].shape[0] > 0  # Has labels
# NOTE(review): register the "slow" marker in pytest.ini/pyproject to avoid
# PytestUnknownMarkWarning; the class docstring already promised this marker.
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow)."""

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples.

        Verifies that sample counts are preserved end-to-end and that the
        TF-IDF vocabulary honors the ``max_features`` cap on 1000 documents.
        """
        n_samples = 1000
        # Seeded RNG so the generated labels are reproducible across runs.
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        # Vocabulary size must not exceed the requested cap.
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        n_labels = 50
        # Seeded RNG so the generated labels are reproducible across runs.
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })
        # Add many label columns
        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        assert labels.shape[1] == n_labels
        # All binarized values must be 0 or 1.
        assert set(labels.values.flatten()).issubset({0, 1})
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Test saving features and labels then loading them back.

        Builds the dataset with ``save_processed=False`` and persists the
        arrays manually into a temporary directory, so the test never writes
        into the real processed-data directory. (The previous version opened
        a ``pytest.MonkeyPatch`` context without patching anything while
        passing ``save_processed=True``, which silently wrote output into
        the library's real PROCESSED_DATA_DIR as a side effect.)
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Build features/labels without letting the library persist them.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False,
            )

            # Persist manually into the isolated temporary directory.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            # Load back and verify a lossless round trip.
            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)
class TestErrorHandlingInPipeline:
    """Integration tests for error handling throughout pipeline."""

    def test_pipeline_with_missing_columns(self):
        """Test pipeline behavior with missing expected columns."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection should find nothing rather than crash.
        assert len(get_text_columns(frame)) == 0

        # Feature extraction must fail loudly when no text columns exist.
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """Test pipeline with all NaN text values."""
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        # NaN text must not crash the extractor; rows may end up all-zero.
        features, _ = extract_tfidf_features(frame, max_features=50)
        assert features.shape[0] == 3
        assert not np.any(np.isnan(features))

    def test_pipeline_with_empty_labels(self):
        """Test pipeline when no labels are present."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
            # No label columns
        })

        # With no label columns, detection should return an empty list.
        assert len(get_label_columns(frame)) == 0