Spaces:
Sleeping
Sleeping
| """ | |
| Unit tests for features.py module. | |
| Tests individual functions for text cleaning, feature extraction, | |
| and label preparation. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from hopcroft_skill_classification_tool_competition.features import ( | |
| clean_github_text, | |
| get_text_columns, | |
| get_label_columns, | |
| combine_text_fields, | |
| extract_tfidf_features, | |
| prepare_labels, | |
| get_dataset_info, | |
| load_data_from_db, | |
| ) | |
class TestTextCleaning:
    """Unit tests for clean_github_text: URL/HTML/code stripping, whitespace
    normalization, Porter stemming, and non-ASCII removal."""

    def test_clean_github_text_removes_urls(self):
        """URLs are stripped while the surrounding words survive (stemmed)."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)
        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        assert "fix" in cleaned.lower()  # Stemmed version of "fixed"
        assert "authent" in cleaned.lower()  # Stemmed version

    def test_clean_github_text_removes_html(self):
        """HTML tags are removed but their inner text is kept."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)
        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        assert "bold" in cleaned.lower()
        # After stemming, "italic" becomes "ital"
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Fenced markdown code blocks are removed entirely."""
        text = """Fixed bug in code:
```python
def foo():
    pass
```
"""
        cleaned = clean_github_text(text)
        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Backtick inline-code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)
        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Runs of spaces and newlines collapse to single spaces."""
        text = "Fixed    multiple   spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)
        # BUG FIX: the check must target a DOUBLE space ("  "); asserting a
        # single space is absent would fail for any multi-word result.
        assert "  " not in cleaned
        assert "\n\n" not in cleaned
        # Should be single spaces
        words = cleaned.split()
        assert len(words) == len([w for w in words if w])  # No empty strings

    # BUG FIX: this test took `text`/`expected_empty` arguments without a
    # parametrize decorator, so pytest would error with "fixture 'text' not
    # found" instead of running the test.
    @pytest.mark.parametrize(
        "text,expected_empty",
        [
            ("", True),
            (None, True),
            ("   ", True),
            ("Fixed a real bug", False),
        ],
    )
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        assert isinstance(cleaned, str)
        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Porter stemming reduces inflected words to their stems."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)
        assert "run" in cleaned.lower()  # running -> run
        assert "walk" in cleaned.lower()  # walked -> walk
        assert "swim" in cleaned.lower()  # swimming -> swim

    def test_clean_github_text_removes_emojis(self):
        """Emojis and other non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)
        # Should only contain ASCII
        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
class TestColumnIdentification:
    """Unit tests covering detection of text columns and label columns."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Exactly the two standard text columns are detected."""
        found = get_text_columns(sample_dataframe)
        assert {'issue text', 'issue description'}.issubset(found)
        assert len(found) == 2

    def test_get_text_columns_handles_missing_columns(self):
        """A frame without recognized text columns yields an empty list."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)
        assert isinstance(found, list)
        assert not found  # no standard text columns present

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Metadata and text columns are excluded; label columns included."""
        labels = get_label_columns(sample_dataframe)
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in labels
        for included in ('Language', 'Data Structure', 'Testing'):
            assert included in labels

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Every detected label column must hold a numeric dtype."""
        labels = get_label_columns(sample_dataframe)
        assert all(
            pd.api.types.is_numeric_dtype(sample_dataframe[name])
            for name in labels
        )
class TestTextCombination:
    """Unit tests for merging multiple text columns into a single Series."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """The combined output is a string Series, one row per input row."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )
        assert isinstance(merged, pd.Series)
        assert len(merged) == len(sample_dataframe)
        for entry in merged:
            assert isinstance(entry, str)
            # Content from both columns (stemmed) should be present.
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """URLs present in the raw text must not survive combination."""
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests",
        ]
        merged = combine_text_fields(sample_dataframe, ['issue text'])
        assert all("https://" not in row for row in merged)
        assert all("example.com" not in row for row in merged)

    def test_combine_text_fields_handles_nulls(self):
        """Null values in either column are tolerated without raising."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar'],
        })
        merged = combine_text_fields(frame, ['text1', 'text2'])
        assert len(merged) == 3
        # Every entry must be a plain string, never None/NaN.
        assert all(isinstance(row, str) for row in merged)
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """The feature matrix has one row per document, <= max_features cols."""
        features, vectorizer = extract_tfidf_features(
            sample_dataframe,
            max_features=50
        )
        assert features.shape[0] == len(sample_dataframe)
        assert features.shape[1] <= 50  # May be less if vocabulary is small
        assert isinstance(vectorizer, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Features come back as a dense float numpy array."""
        features, _ = extract_tfidf_features(sample_dataframe)
        assert isinstance(features, np.ndarray)
        assert features.dtype == np.float64 or features.dtype == np.float32

    # BUG FIX: `max_features` arrived as an undeclared argument, so pytest
    # reported "fixture 'max_features' not found"; a parametrize decorator
    # is required to supply it.
    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """Test that max_features parameter is respected."""
        features, _ = extract_tfidf_features(
            sample_dataframe,
            max_features=max_features
        )
        if max_features is not None:
            assert features.shape[1] <= max_features

    # BUG FIX: `ngram_range` likewise needed a parametrize decorator.
    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (2, 2)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Test different n-gram ranges."""
        features, vectorizer = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50
        )
        assert features.shape[0] == len(sample_dataframe)
        vocab = vectorizer.get_feature_names_out()
        # Check that n-grams are present if range includes them
        if ngram_range[1] > 1:
            # Should have some bigrams (words with a space in them)
            bigrams = [term for term in vocab if ' ' in term]
            assert len(bigrams) > 0 or len(vocab) < 50  # May not have bigrams if vocab is small

    def test_extract_tfidf_features_handles_empty_text(self):
        """Documents with empty text yield finite, NaN-free features."""
        df = pd.DataFrame({
            'issue text': ['', 'valid text', ' '],
            'issue description': ['desc', '', 'another desc']
        })
        features, vectorizer = extract_tfidf_features(df, max_features=50)
        # Should not raise error
        assert features.shape[0] == 3
        assert not np.any(np.isnan(features))
        assert not np.any(np.isinf(features))
class TestLabelPreparation:
    """Unit tests for converting label count columns to a binary matrix."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Only 0 and 1 may appear in the prepared label matrix."""
        binary = prepare_labels(sample_dataframe)
        distinct = np.unique(binary.values)
        assert set(distinct).issubset({0, 1})

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Shape is (num_rows, num_label_columns)."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert binary.shape == (len(sample_dataframe), len(expected_cols))

    def test_prepare_labels_converts_counts_to_binary(self):
        """Any positive count collapses to 1; zero stays 0."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],  # 5 should become 1
            'Label2': [3, 0],  # 3 should become 1
            'Label3': [0, 0],
        })
        binary = prepare_labels(frame)
        assert binary.loc[0, 'Label1'] == 0
        assert binary.loc[0, 'Label2'] == 1
        assert binary.loc[1, 'Label1'] == 1
        assert binary.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Output columns match get_label_columns order exactly."""
        expected_cols = get_label_columns(sample_dataframe)
        binary = prepare_labels(sample_dataframe)
        assert list(binary.columns) == expected_cols
class TestDatasetInfo:
    """Unit tests for the get_dataset_info summary dictionary."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """The summary is a plain dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Every documented summary key must be present."""
        summary = get_dataset_info(sample_dataframe)
        expected = {
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        }
        assert expected.issubset(summary)

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Row, column, and text-column counts mirror the frame."""
        summary = get_dataset_info(sample_dataframe)
        rows, cols = sample_dataframe.shape
        assert summary['total_issues'] == rows
        assert summary['total_columns'] == cols
        assert summary['num_text_columns'] == 2  # issue text and description

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Label averages/medians are non-negative; avg bounded by num_labels."""
        summary = get_dataset_info(sample_dataframe)
        assert summary['avg_labels_per_issue'] >= 0
        assert summary['median_labels_per_issue'] >= 0
        assert summary['avg_labels_per_issue'] <= summary['num_labels']
class TestDatabaseLoading:
    """Unit tests for loading issue data from a temporary database fixture."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Loading yields a non-empty DataFrame."""
        loaded = load_data_from_db(temp_db)
        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """All expected metadata and text columns are present."""
        loaded = load_data_from_db(temp_db)
        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """A missing database path raises some exception."""
        from pathlib import Path

        missing = Path("/nonexistent/path/to/db.db")
        # Could be FileNotFoundError or an sqlite3 error, hence Exception.
        with pytest.raises(Exception):
            load_data_from_db(missing)
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """TF-IDF extraction works on a corpus of exactly one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1],
        })
        # min_df=1 is required for a one-document corpus.
        matrix, fitted = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Identical documents still produce a non-zero feature matrix."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1],
        })
        # max_df=1.0 keeps terms appearing in 100% of documents, and
        # min_df=1 keeps terms even though every document is identical.
        matrix, _ = extract_tfidf_features(
            frame,
            max_features=50,
            min_df=1,
            max_df=1.0,
        )
        # All documents should have similar (but not necessarily identical)
        # features.
        assert matrix.shape[0] == 3
        assert not np.all(matrix == 0)

    def test_prepare_labels_with_all_zeros(self):
        """A label column of all zeros stays all zeros after binarization."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],  # all zeros
            'Label2': [1, 1],
        })
        binary = prepare_labels(frame)
        assert binary['Label1'].sum() == 0
        assert binary['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Pure punctuation input is handled gracefully (result may be empty)."""
        cleaned = clean_github_text("!@#$%^&*()")
        assert isinstance(cleaned, str)