Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

File size: 4,328 Bytes

225af6a

"""
Root pytest configuration and shared fixtures.

This module provides fixtures that are available to all test modules.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer


@pytest.fixture
def sample_text_data():
    """Return five clean, single-sentence issue/PR-style texts.

    Returns:
        list: A newly built list of five strings on every call.
    """
    clean_sentences = (
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    )
    # Hand out a list (not the tuple) so callers get the usual mutable type.
    return list(clean_sentences)


@pytest.fixture
def sample_dirty_text():
    """Return five texts, each carrying one kind of GitHub-style noise.

    In order: a URL, HTML tags plus inline code, an emoji, a fenced code
    block, and irregular whitespace (repeated spaces and blank lines).

    Returns:
        list: A newly built list of five strings.
    """
    with_url = "Fixed bug https://github.com/repo/issues/123 in auth system"
    with_markup = "Added feature with <b>HTML tags</b> and `inline code`"
    with_emoji = "Removed emoji 😀 and special characters"
    # Spelled out line by line so the embedded indentation and the trailing
    # whitespace after the closing fence are visible.
    with_code_fence = (
        "Updated docs with code block:\n"
        "        ```python\n"
        "        def foo():\n"
        "            pass\n"
        "        ```\n"
        "        "
    )
    with_odd_whitespace = "Fixed    multiple   spaces   and\n\nnewlines"

    return [
        with_url,
        with_markup,
        with_emoji,
        with_code_fence,
        with_odd_whitespace,
    ]


@pytest.fixture
def sample_labels():
    """Return a 5-row table of 0/1 indicators for five label columns.

    Columns, in order: 'Language', 'Data Structure', 'Testing', 'API',
    'DevOps'. Each row may have several labels set at once.

    Returns:
        pandas.DataFrame: Five rows by five integer columns.
    """
    label_names = ('Language', 'Data Structure', 'Testing', 'API', 'DevOps')
    # One list per label, aligned with label_names.
    indicators = (
        [1, 1, 1, 0, 1],
        [1, 0, 0, 1, 1],
        [0, 0, 1, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 0, 0, 1, 1],
    )
    by_label = dict(zip(label_names, indicators))
    return pd.DataFrame(by_label)


@pytest.fixture
def sample_dataframe(sample_text_data, sample_labels):
    """Combine the sample texts and labels into one five-row dataframe.

    The frame has the metadata columns 'Repo Name', 'PR #', 'issue text'
    and 'issue description', followed by every column of ``sample_labels``.

    Args:
        sample_text_data: Indexable sequence; items 0 through 4 become the
            'issue text' column.
        sample_labels: DataFrame whose columns are copied over by position
            (raw values, ignoring its index).

    Returns:
        pandas.DataFrame: The assembled frame.
    """
    row_count = 5
    pr_numbers = list(range(1, row_count + 1))

    base_columns = {
        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
        'PR #': pr_numbers,
        # Explicit indexing (not slicing) so a too-short input fails loudly.
        'issue text': [sample_text_data[i] for i in range(row_count)],
        'issue description': [
            f'Description for issue {number}' for number in pr_numbers
        ],
    }
    frame = pd.DataFrame(base_columns)

    # Append the label columns after the metadata columns.
    for label_name in sample_labels.columns:
        frame[label_name] = sample_labels[label_name].values

    return frame


@pytest.fixture
def temp_db(sample_dataframe):
    """Yield the path of a throw-away SQLite database holding the sample data.

    The dataframe is written to the table
    ``nlbse_tool_competition_data_by_issue`` (replacing it if present and
    without the index column). The database file is removed afterwards,
    including when populating it fails.

    Args:
        sample_dataframe: Object providing ``to_sql`` (the sample dataframe
            fixture) whose content is stored in the database.

    Yields:
        pathlib.Path: Location of the temporary ``.db`` file.
    """
    # delete=False: this only reserves a unique file name; sqlite3 opens the
    # path itself below, after the temporary handle has been closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as handle:
        db_path = Path(handle.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue',
                                    conn, if_exists='replace', index=False)
        finally:
            # Close even if the insert raises, so the connection never leaks.
            conn.close()

        yield db_path
    finally:
        # Runs on regular teardown and when setup above failed.
        try:
            db_path.unlink()
        except FileNotFoundError:
            # The consumer already removed the file; nothing left to clean up.
            pass


@pytest.fixture
def sample_tfidf_vectorizer():
    """Return a new, unfitted TF-IDF vectorizer with small test settings.

    Configured with ``max_features=100``, ``ngram_range=(1, 2)`` and
    ``stop_words='english'``.

    Returns:
        TfidfVectorizer: A fresh instance on every call.
    """
    settings = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**settings)


@pytest.fixture
def sample_sparse_features():
    """Return a reproducible, mostly-zero 100 x 50 feature matrix.

    Every row has 5 to 10 randomly chosen distinct columns filled with
    values from ``rand``; all other entries are zero. The result is a dense
    ``numpy.ndarray`` with sparse *content*, not a scipy sparse matrix.

    A private generator with a fixed seed is used, so the matrix is the same
    on every call and the global ``np.random`` state is left untouched.

    Returns:
        numpy.ndarray: Float array of shape (100, 50).
    """
    n_rows, n_cols = 100, 50
    # Fixed seed: a failing test sees the same data again on the next run.
    rng = np.random.RandomState(42)

    features = np.zeros((n_rows, n_cols))
    for row in features:  # each row is a view, so writes land in `features`
        n_nonzero = rng.randint(5, 11)  # upper bound is exclusive -> 5..10
        columns = rng.choice(n_cols, n_nonzero, replace=False)
        row[columns] = rng.rand(n_nonzero)

    return features


@pytest.fixture
def sample_multilabel_data():
    """Return a reproducible 100 x 10 binary multi-label indicator matrix.

    Every row (sample) has between 1 and 5 distinct labels set to 1; the
    remaining entries are 0.

    A private generator with a fixed seed is used, so the matrix is the same
    on every call and the global ``np.random`` state is left untouched.

    Returns:
        numpy.ndarray: Integer array of shape (100, 10) containing 0s and 1s.
    """
    n_samples = 100
    n_labels = 10
    # Fixed seed: a failing test sees the same data again on the next run.
    rng = np.random.RandomState(42)

    labels = np.zeros((n_samples, n_labels), dtype=int)
    for row in labels:  # each row is a view, so writes land in `labels`
        n_active = rng.randint(1, 6)  # upper bound is exclusive -> 1..5
        active = rng.choice(n_labels, n_active, replace=False)
        row[active] = 1

    return labels


@pytest.fixture
def empty_text_samples():
    """Return degenerate text inputs for edge-case tests.

    In order: an empty string, ``None``, spaces only, newlines only, and a
    single-character string.

    Returns:
        list: A newly built five-element list (strings and one ``None``).
    """
    empty = ""
    missing = None
    only_spaces = "   "
    only_newlines = "\n\n\n"
    single_char = "a"
    return [empty, missing, only_spaces, only_newlines, single_char]