""" Root pytest configuration and shared fixtures. This module provides fixtures that are available to all test modules. """ import pytest import numpy as np import pandas as pd import tempfile import sqlite3 from pathlib import Path from sklearn.feature_extraction.text import TfidfVectorizer @pytest.fixture def sample_text_data(): """Fixture providing sample text data for testing.""" return [ "Fixed bug in authentication system using OAuth2", "Implemented REST API endpoint for user data retrieval", "Added unit tests for data processing pipeline", "Refactored code to improve performance and reduce memory usage", "Updated database schema with new migration scripts", ] @pytest.fixture def sample_dirty_text(): """Fixture providing text with common GitHub noise.""" return [ "Fixed bug https://github.com/repo/issues/123 in auth system", "Added feature with HTML tags and `inline code`", "Removed emoji 😀 and special characters", """Updated docs with code block: ```python def foo(): pass ``` """, "Fixed multiple spaces and\n\nnewlines", ] @pytest.fixture def sample_labels(): """Fixture providing sample multi-label data.""" return pd.DataFrame({ 'Language': [1, 1, 1, 0, 1], 'Data Structure': [1, 0, 0, 1, 1], 'Testing': [0, 0, 1, 0, 0], 'API': [1, 1, 0, 0, 0], 'DevOps': [0, 0, 0, 1, 1], }) @pytest.fixture def sample_dataframe(sample_text_data, sample_labels): """Fixture providing complete sample dataframe.""" df = pd.DataFrame({ 'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'], 'PR #': [1, 2, 3, 4, 5], 'issue text': [sample_text_data[0], sample_text_data[1], sample_text_data[2], sample_text_data[3], sample_text_data[4]], 'issue description': ['Description for issue 1', 'Description for issue 2', 'Description for issue 3', 'Description for issue 4', 'Description for issue 5'], }) # Add label columns for col in sample_labels.columns: df[col] = sample_labels[col].values return df @pytest.fixture def temp_db(sample_dataframe): """Fixture providing temporary SQLite database.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as f: db_path = f.name # Create database and insert data conn = sqlite3.connect(db_path) sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue', conn, if_exists='replace', index=False) conn.close() yield Path(db_path) # Cleanup Path(db_path).unlink() @pytest.fixture def sample_tfidf_vectorizer(): """Fixture providing a simple TF-IDF vectorizer.""" vectorizer = TfidfVectorizer( max_features=100, ngram_range=(1, 2), stop_words='english' ) return vectorizer @pytest.fixture def sample_sparse_features(): """Fixture providing sample sparse feature matrix.""" # Create a sparse matrix (mostly zeros) features = np.zeros((100, 50)) # Add some non-zero values for i in range(100): # Each row has 5-10 non-zero features n_nonzero = np.random.randint(5, 11) indices = np.random.choice(50, n_nonzero, replace=False) features[i, indices] = np.random.rand(n_nonzero) return features @pytest.fixture def sample_multilabel_data(): """Fixture providing sample multi-label classification data.""" n_samples = 100 n_labels = 10 # Generate labels with varying frequencies labels = np.zeros((n_samples, n_labels), dtype=int) for i in range(n_samples): # Each sample has 1-5 labels n_labels_per_sample = np.random.randint(1, 6) label_indices = np.random.choice(n_labels, n_labels_per_sample, replace=False) labels[i, label_indices] = 1 return labels @pytest.fixture def empty_text_samples(): """Fixture providing edge case: empty or null text samples.""" return [ "", None, " ", "\n\n\n", "a", # Single character ]