File size: 4,328 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Root pytest configuration and shared fixtures.

This module provides fixtures that are available to all test modules.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer


@pytest.fixture
def sample_text_data():
    """Return five clean, issue-style sentences to use as test input.

    A new list is built on every call, so callers may mutate it freely.
    """
    messages = (
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    )
    return list(messages)


@pytest.fixture
def sample_dirty_text():
    """Return five strings, each carrying a different kind of GitHub-style noise.

    In order: a URL, HTML tags plus inline code, an emoji, a fenced code
    block, and irregular whitespace.
    """
    with_url = "Fixed bug https://github.com/repo/issues/123 in auth system"
    with_markup = "Added feature with <b>HTML tags</b> and `inline code`"
    with_emoji = "Removed emoji 😀 and special characters"
    # The leading spaces on each line are part of the sample's value, so they
    # are spelled out explicitly rather than left to source indentation.
    with_code_block = (
        "Updated docs with code block:\n"
        "        ```python\n"
        "        def foo():\n"
        "            pass\n"
        "        ```\n"
        "        "
    )
    with_whitespace = "Fixed    multiple   spaces   and\n\nnewlines"

    return [with_url, with_markup, with_emoji, with_code_block, with_whitespace]


@pytest.fixture
def sample_labels():
    """Return a 5-row DataFrame of 0/1 indicators, one column per label."""
    # One entry per label column; list position i holds the flag for row i.
    indicator_columns = {
        'Language': [1, 1, 1, 0, 1],
        'Data Structure': [1, 0, 0, 1, 1],
        'Testing': [0, 0, 1, 0, 0],
        'API': [1, 1, 0, 0, 0],
        'DevOps': [0, 0, 0, 1, 1],
    }
    label_frame = pd.DataFrame(data=indicator_columns)
    return label_frame


@pytest.fixture
def sample_dataframe(sample_text_data, sample_labels):
    """Build the 5-row issue table: metadata columns followed by the labels.

    Args:
        sample_text_data: indexable collection; items 0-4 become 'issue text'.
        sample_labels: DataFrame whose columns are appended in their order.

    Returns:
        DataFrame with 'Repo Name', 'PR #', 'issue text',
        'issue description' and one column per label.
    """
    repo_names = ['repo1', 'repo2', 'repo1', 'repo3', 'repo2']
    issue_texts = [sample_text_data[position] for position in range(5)]
    descriptions = [
        'Description for issue 1',
        'Description for issue 2',
        'Description for issue 3',
        'Description for issue 4',
        'Description for issue 5',
    ]

    frame = pd.DataFrame({
        'Repo Name': repo_names,
        'PR #': list(range(1, 6)),
        'issue text': issue_texts,
        'issue description': descriptions,
    })

    # Copy labels across by position: `.values` drops the label frame's index,
    # so rows line up by order rather than by index alignment.
    for label_name, label_column in sample_labels.items():
        frame[label_name] = label_column.values

    return frame


@pytest.fixture
def temp_db(sample_dataframe):
    """Yield the path of a temporary SQLite database holding ``sample_dataframe``.

    The frame is written, without its index, to the table
    ``nlbse_tool_competition_data_by_issue`` (replacing any existing table).
    The database file is removed when the generator is resumed after the
    yield, when it is closed, or when populating the database fails.

    Args:
        sample_dataframe: DataFrame to store in the database.

    Yields:
        pathlib.Path: location of the SQLite file.
    """
    # Only a unique file name is needed here; SQLite opens the file itself.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as f:
        db_path = Path(f.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue',
                                    conn, if_exists='replace', index=False)
        finally:
            # Close even if the write fails so the handle is never leaked.
            conn.close()

        yield db_path
    finally:
        # Runs on normal teardown, on setup failure and on generator close.
        # The consumer may already have deleted the file; that is not an error.
        try:
            db_path.unlink()
        except FileNotFoundError:
            pass


@pytest.fixture
def sample_tfidf_vectorizer():
    """Return a new, unfitted TfidfVectorizer with a small test configuration.

    Configured with ``max_features=100``, ``ngram_range=(1, 2)`` and
    ``stop_words='english'``.
    """
    settings = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**settings)


@pytest.fixture
def sample_sparse_features():
    """Return a reproducible 100 x 50 feature array in which most entries are zero.

    The result is a dense ``numpy.ndarray`` (not a SciPy sparse matrix). Each
    row has between 5 and 10 randomly placed entries drawn uniformly from
    [0, 1); all other entries are 0.0.

    A fixed-seed local generator is used so every call returns the same data
    and the global NumPy random state is left untouched.

    Returns:
        numpy.ndarray: float array of shape (100, 50).
    """
    n_rows, n_cols = 100, 50
    rng = np.random.default_rng(0)
    features = np.zeros((n_rows, n_cols))

    # Iterating a 2-D array yields row views, so writes land in `features`.
    for row in features:
        n_nonzero = rng.integers(5, 11)  # upper bound is exclusive: 5..10
        columns = rng.choice(n_cols, size=n_nonzero, replace=False)
        row[columns] = rng.random(n_nonzero)

    return features


@pytest.fixture
def sample_multilabel_data():
    """Return a reproducible 100 x 10 binary multi-label indicator matrix.

    Each row (sample) has between 1 and 5 distinct, randomly chosen labels
    set to 1; all other entries are 0.

    A fixed-seed local generator is used so every call returns the same data
    and the global NumPy random state is left untouched.

    Returns:
        numpy.ndarray: integer array of shape (100, 10) containing 0s and 1s.
    """
    n_samples = 100
    n_labels = 10
    rng = np.random.default_rng(0)

    labels = np.zeros((n_samples, n_labels), dtype=int)

    # Iterating a 2-D array yields row views, so writes land in `labels`.
    for row in labels:
        n_active = rng.integers(1, 6)  # upper bound is exclusive: 1..5
        active = rng.choice(n_labels, size=n_active, replace=False)
        row[active] = 1

    return labels


@pytest.fixture
def empty_text_samples():
    """Return edge-case text inputs: empty, None, whitespace-only and one character."""
    empty = ""
    missing = None
    spaces_only = "   "
    newlines_only = "\n\n\n"
    single_character = "a"
    return [empty, missing, spaces_only, newlines_only, single_character]