Spaces:
Sleeping
Sleeping
| """ | |
| Root pytest configuration and shared fixtures. | |
| This module provides fixtures that are available to all test modules. | |
| """ | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| import tempfile | |
| import sqlite3 | |
| from pathlib import Path | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
def sample_text_data():
    """Provide five short, clean change-description strings for tests.

    Returns:
        A new list of five strings on every call.
    """
    messages = (
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    )
    # Hand back a list (not the tuple) so callers may index or mutate it.
    return list(messages)
def sample_dirty_text():
    """Provide five strings containing typical GitHub noise.

    The samples cover a URL, HTML tags with inline code, an emoji, a fenced
    code block, and embedded newlines.

    Returns:
        A new list of five strings on every call.
    """
    # NOTE(review): the leading whitespace inside the fenced block could not
    # be recovered from this view of the file - confirm against the original.
    fenced_block_sample = (
        "Updated docs with code block:\n"
        "```python\n"
        "def foo():\n"
        "pass\n"
        "```\n"
    )
    url_sample = "Fixed bug https://github.com/repo/issues/123 in auth system"
    markup_sample = "Added feature with <b>HTML tags</b> and `inline code`"
    emoji_sample = "Removed emoji 😀 and special characters"
    whitespace_sample = "Fixed multiple spaces and\n\nnewlines"
    return [
        url_sample,
        markup_sample,
        emoji_sample,
        fenced_block_sample,
        whitespace_sample,
    ]
def sample_labels():
    """Provide a 5-row binary indicator table with one column per label.

    Returns:
        A new pandas DataFrame with columns 'Language', 'Data Structure',
        'Testing', 'API' and 'DevOps', each holding 0/1 values.
    """
    # Insertion order of this mapping fixes the DataFrame column order.
    indicator_columns = {
        'Language': [1, 1, 1, 0, 1],
        'Data Structure': [1, 0, 0, 1, 1],
        'Testing': [0, 0, 1, 0, 0],
        'API': [1, 1, 0, 0, 0],
        'DevOps': [0, 0, 0, 1, 1],
    }
    label_frame = pd.DataFrame(indicator_columns)
    return label_frame
def sample_dataframe(sample_text_data, sample_labels):
    """Assemble a 5-row DataFrame of issue metadata plus label columns.

    Args:
        sample_text_data: Indexable collection; items 0 through 4 become the
            'issue text' column.
        sample_labels: DataFrame whose columns are appended, in order, after
            the metadata columns. Values are copied positionally.

    Returns:
        A DataFrame with 'Repo Name', 'PR #', 'issue text' and
        'issue description' followed by every column of ``sample_labels``.
    """
    row_count = 5
    issue_texts = [sample_text_data[position] for position in range(row_count)]
    descriptions = [
        f'Description for issue {number}' for number in range(1, row_count + 1)
    ]
    frame = pd.DataFrame({
        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
        'PR #': [1, 2, 3, 4, 5],
        'issue text': issue_texts,
        'issue description': descriptions,
    })

    # Assign the raw values so the labels' own index is ignored.
    for label_name in sample_labels.columns:
        frame[label_name] = sample_labels[label_name].values
    return frame
def temp_db(sample_dataframe):
    """Yield the path of a temporary SQLite database holding the sample data.

    The DataFrame is written to the table
    ``nlbse_tool_competition_data_by_issue``. The database file is removed
    once the generator finishes, is closed, or has an exception thrown into
    it.

    Args:
        sample_dataframe: DataFrame to store; its index is not written.

    Yields:
        ``pathlib.Path`` pointing at the populated ``.db`` file.
    """
    # NOTE(review): no @pytest.fixture decorator is visible in this view of
    # the file - confirm this generator is registered as a fixture.

    # Only a unique file name is needed; the handle is closed immediately so
    # SQLite can open the file itself.
    with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as handle:
        db_path = Path(handle.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue',
                                    conn, if_exists='replace', index=False)
        finally:
            # Release the connection even when the insert fails.
            conn.close()

        yield db_path
    finally:
        # Runs on normal teardown, on generator close and on errors, and
        # tolerates a consumer that has already deleted the file.
        db_path.unlink(missing_ok=True)
def sample_tfidf_vectorizer():
    """Provide an unfitted TF-IDF vectorizer with a small configuration.

    Returns:
        A new ``TfidfVectorizer`` configured with at most 100 features,
        an n-gram range of (1, 2) and the 'english' stop-word setting.
    """
    settings = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**settings)
def sample_sparse_features():
    """Provide a mostly-zero 100 x 50 feature matrix.

    The result is a dense ``numpy.ndarray`` (not a SciPy sparse matrix).
    Each row receives between 5 and 10 values drawn from [0, 1) at randomly
    chosen columns; every other entry is 0.0.

    A locally seeded generator is used so the data is identical on every
    call and run, and the global NumPy random state is left untouched.

    Returns:
        A new float array of shape (100, 50).
    """
    n_rows, n_cols = 100, 50
    rng = np.random.default_rng(42)

    features = np.zeros((n_rows, n_cols))
    for row_index in range(n_rows):
        # Upper bound is exclusive, so each row gets 5-10 populated columns.
        n_nonzero = int(rng.integers(5, 11))
        columns = rng.choice(n_cols, size=n_nonzero, replace=False)
        features[row_index, columns] = rng.random(n_nonzero)
    return features
def sample_multilabel_data():
    """Provide a 100 x 10 binary label matrix for multi-label tests.

    Each row has between 1 and 5 labels set to 1 at randomly chosen
    positions; all other entries are 0.

    A locally seeded generator is used so the data is identical on every
    call and run, and the global NumPy random state is left untouched.

    Returns:
        A new integer array of shape (100, 10) containing only 0 and 1.
    """
    n_samples = 100
    n_labels = 10
    rng = np.random.default_rng(42)

    labels = np.zeros((n_samples, n_labels), dtype=int)
    for sample_index in range(n_samples):
        # Upper bound is exclusive, so each sample gets 1-5 active labels.
        n_active = int(rng.integers(1, 6))
        active = rng.choice(n_labels, size=n_active, replace=False)
        labels[sample_index, active] = 1
    return labels
def empty_text_samples():
    """Provide edge-case inputs: empty, missing, blank and minimal text.

    Returns:
        A new list containing an empty string, ``None``, a single space,
        three newlines, and a one-character string, in that order.
    """
    blank_like = ["", None, " ", "\n\n\n"]
    single_character = "a"
    return blank_like + [single_character]