File size: 15,567 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
"""
Feature extraction module for skill classification.

This module provides functions to extract features from the SkillScope dataset,
starting with TF-IDF vectorization of textual data from pull request issues.

Dataset Information (from nlbse_tool_competition_data_by_issue):
- 7,154 issues from 11 Java repositories
- 226 total columns:
    - 2 text columns: 'issue text' (title) and 'issue description' (body)
    - metadata and other columns containing PR/file/context information
    - 217 label columns: domain/subdomain skill labels (142 active labels in this DB)

Label Characteristics:
- Multi-label classification problem
- Average 32.9 labels per issue (median: 31)
- Highly imbalanced: some labels appear in all issues, others in very few
- Top labels: Language, Data Structure, DevOps, Error Handling
"""

from pathlib import Path
import re
import sqlite3
from typing import Optional, Tuple

import joblib

# Import for stemming
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from hopcroft_skill_classification_tool_competition.config import (
    MODELS_DIR,
    PROCESSED_DATA_DIR,
    RAW_DATA_DIR,
)

# Initialize the stemmer once at module load for efficiency
stemmer = PorterStemmer()


def clean_github_text(text: str, use_stemming: bool = True) -> str:
    """
    Normalize raw GitHub issue text following the SkillScope paper
    (Aracena et al. cleaning pipeline): strip URLs, HTML tags, markdown
    code, emojis/non-ASCII characters and redundant whitespace, then
    optionally apply Porter stemming.

    Args:
        text: Raw text taken from a GitHub issue.
        use_stemming: When True, stem every token (recommended for TF-IDF).
            When False, keep the original word forms (recommended for
            embedding/LLM features).

    Returns:
        The cleaned (and possibly stemmed) text string.
    """
    if pd.isna(text) or text is None:
        return ""

    cleaned = str(text)

    # Strip noise in pipeline order: URLs (http/https/www), HTML tags,
    # fenced markdown code blocks, then inline code spans.
    for pattern in (r"http\S+|www\.\S+", r"<[^>]+>", r"```[\s\S]*?```", r"`[^`]*`"):
        cleaned = re.sub(pattern, "", cleaned)

    # Drop emojis and any other non-ASCII characters.
    cleaned = cleaned.encode("ascii", "ignore").decode("ascii")

    # Collapse whitespace runs and trim the ends.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Stemming is applied only for TF-IDF features, not for embeddings.
    if use_stemming:
        try:
            cleaned = " ".join(stemmer.stem(token) for token in cleaned.split())
        except Exception as e:
            print(f"Warning: Stemming failed for text snippet '{cleaned[:50]}...'. Error: {e}")
            # Fall back to the cleaned-but-unstemmed text on stemming errors.
            return cleaned.strip()

    return cleaned


def get_dataset_info(df: pd.DataFrame) -> dict:
    """
    Compute summary statistics for the SkillScope dataset.

    Args:
        df: Input dataframe.

    Returns:
        Dictionary with issue/column counts, detected text and label
        columns, and label-distribution statistics.
    """
    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    # Binarize: a label counts as present when its value is positive.
    binary = df[label_cols].gt(0).astype(int)
    per_issue = binary.sum(axis=1)  # how many labels each issue carries
    per_label = binary.sum(axis=0)  # how many issues each label covers

    return {
        "total_issues": len(df),
        "total_columns": len(df.columns),
        "text_columns": text_cols,
        "num_text_columns": len(text_cols),
        "label_columns": label_cols,
        "num_labels": len(label_cols),
        "avg_labels_per_issue": per_issue.mean(),
        "median_labels_per_issue": per_issue.median(),
        "max_labels_per_issue": per_issue.max(),
        "min_labels_per_issue": per_issue.min(),
        "avg_issues_per_label": per_label.mean(),
        "labels_with_no_issues": per_label.eq(0).sum(),
    }


def load_data_from_db(db_path: Optional[Path] = None) -> pd.DataFrame:
    """
    Load data from the SQLite database.

    Args:
        db_path: Path to the SQLite database file.
                 If None, uses default path in data/raw/skillscope_data.db

    Returns:
        DataFrame containing the nlbse_tool_competition_data_by_issue table
    """
    if db_path is None:
        db_path = RAW_DATA_DIR / "skillscope_data.db"

    conn = sqlite3.connect(db_path)
    try:
        # Load the main table. The try/finally guarantees the connection is
        # closed even when the query fails (the previous version leaked the
        # handle on error).
        df = pd.read_sql_query(
            "SELECT * FROM nlbse_tool_competition_data_by_issue", conn
        )
    finally:
        conn.close()

    print(f"Loaded {len(df)} records from database")
    return df


def get_text_columns(df: pd.DataFrame) -> list:
    """
    Return the text columns present in the dataframe.

    Args:
        df: Input dataframe.

    Returns:
        Names of the known SkillScope text columns that exist in *df*.
    """
    # Known text fields from the SkillScope schema:
    # 'issue text' (title) and 'issue description' (body).
    candidates = ("issue text", "issue description")
    return [name for name in candidates if name in df.columns]


def get_label_columns(df: pd.DataFrame) -> list:
    """
    Return the label columns (domain/subdomain skill counts).

    A label column is any numeric column that is not one of the known
    metadata fields.

    Args:
        df: Input dataframe.

    Returns:
        Names of the label columns, in dataframe column order.
    """
    from pandas.api.types import is_numeric_dtype

    # Metadata fields that must never be treated as skill labels.
    metadata = {
        "Repo Name",
        "PR #",
        "issue text",
        "issue description",
        "created_at",
        "author_name",
    }

    # Numeric check via pandas, robust to different dtype representations.
    return [c for c in df.columns if c not in metadata and is_numeric_dtype(df[c])]


def combine_text_fields(
    df: pd.DataFrame, text_columns: list, use_stemming: bool = True
) -> pd.Series:
    """
    Clean each text field (SkillScope-style cleaning) and join them
    row-wise into one string per issue.

    Args:
        df: Input dataframe.
        text_columns: Column names to clean and concatenate.
        use_stemming: True applies stemming (for TF-IDF); False keeps the
            original words (for embeddings).

    Returns:
        Series of cleaned, space-joined text for each row.
    """
    # Clean every cell first, then glue the columns together per row.
    cleaned = (
        df[text_columns]
        .fillna("")
        .astype(str)
        .apply(lambda col: col.map(lambda t: clean_github_text(t, use_stemming=use_stemming)))
    )
    return cleaned.agg(" ".join, axis=1)


def extract_tfidf_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    max_features: Optional[int] = 2000,
    min_df: int = 2,
    max_df: float = 0.95,
    ngram_range: Tuple[int, int] = (1, 2),
) -> Tuple[np.ndarray, TfidfVectorizer]:
    """
    Build a TF-IDF feature matrix from the dataframe's text columns.

    Args:
        df: Input dataframe.
        text_columns: Text columns to use; auto-detected when None.
        max_features: Cap on vocabulary size (default: 2000 for balanced sparsity).
        min_df: Drop terms appearing in fewer than this many documents.
        max_df: Drop terms appearing in more than this fraction of documents.
        ngram_range: N-gram span, e.g. (1, 2) for unigrams plus bigrams.

    Returns:
        Tuple of (dense feature matrix, fitted vectorizer).

    Raises:
        ValueError: when no text columns are available.
    """
    cols = text_columns if text_columns is not None else get_text_columns(df)
    if not cols:
        raise ValueError("No text columns found in dataframe")

    # Combine the text fields, stemmed — TF-IDF benefits from stemming.
    print(f"Combining text from columns: {cols}")
    corpus = combine_text_fields(df, cols, use_stemming=True)

    vec = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        stop_words="english",
        lowercase=True,
        strip_accents="unicode",
    )

    print(
        f"Extracting TF-IDF features with max_features={max_features if max_features else 'All'}, "
        f"ngram_range={ngram_range}"
    )
    matrix = vec.fit_transform(corpus)

    print(
        f"Extracted {matrix.shape[1]} TF-IDF features from {matrix.shape[0]} samples"
    )

    # Densify for downstream np.save consumers; callers expect an ndarray.
    return matrix.toarray(), vec


def extract_embedding_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 32,
) -> Tuple[np.ndarray, object]:
    """
    Encode the dataframe's text columns with a Sentence Transformers model.

    Args:
        df: Input dataframe.
        text_columns: Text columns to use; auto-detected when None.
        model_name: Pre-trained sentence-transformers model name.
        batch_size: Encoding batch size.

    Returns:
        Tuple of (embedding matrix, loaded model object).

    Raises:
        ImportError: when sentence-transformers is not installed.
        ValueError: when no text columns are available.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        raise ImportError(
            f"sentence-transformers import failed: {e}. Try running: pip install sentence-transformers"
        ) from e

    cols = text_columns if text_columns is not None else get_text_columns(df)
    if not cols:
        raise ValueError("No text columns found in dataframe")

    # No stemming here: transformer models need the original word forms.
    print(f"Combining text from columns: {cols}")
    corpus = combine_text_fields(df, cols, use_stemming=False)

    print(f"Loading embedding model: {model_name}")
    encoder = SentenceTransformer(model_name)

    print(f"Extracting embeddings for {len(corpus)} samples...")
    vectors = encoder.encode(
        corpus.tolist(),
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    print(f"Extracted embeddings shape: {vectors.shape}")

    return vectors, encoder


def prepare_labels(df: pd.DataFrame, label_columns: Optional[list] = None) -> pd.DataFrame:
    """
    Build the multi-label binary matrix from the label count columns.

    Args:
        df: Input dataframe.
        label_columns: Label columns to binarize; auto-detected when None.

    Returns:
        DataFrame of 0/1 values (1 wherever the label's count is positive).
    """
    if label_columns is None:
        label_columns = get_label_columns(df)

    # A label is "present" whenever its count is strictly positive.
    labels = df[label_columns].gt(0).astype(int)

    print(f"Prepared {len(label_columns)} labels")
    print(f"Label distribution:\n{labels.sum().describe()}")

    return labels


def create_feature_dataset(
    db_path: Optional[Path] = None,
    save_processed: bool = True,
    feature_type: str = "tfidf",  # 'tfidf' or 'embedding'
    model_name: str = "all-MiniLM-L6-v2",
) -> Tuple[np.ndarray, pd.DataFrame, list, list]:
    """
    Main function to create the complete feature dataset.

    Loads the SkillScope SQLite table, prints dataset statistics, extracts
    either TF-IDF or embedding features, binarizes the labels, and
    optionally persists features/labels plus the inference artifacts
    (TF-IDF vectorizer and label names).

    Args:
        db_path: Path to SQLite database
        save_processed: Whether to save processed data to disk
        feature_type: Type of features to extract ('tfidf' or 'embedding')
        model_name: Model name for embeddings (ignored if feature_type='tfidf')

    Returns:
        Tuple of (features, labels, feature_names, label_names)

    Raises:
        ValueError: if feature_type is neither 'tfidf' nor 'embedding'.
    """
    # Load data
    df = load_data_from_db(db_path)

    # Get dataset info (purely informational; printed for the operator)
    info = get_dataset_info(df)
    print("\n=== Dataset Information ===")
    print(f"Total issues: {info['total_issues']:,}")
    print(f"Text columns: {info['text_columns']}")
    print(f"Number of labels: {info['num_labels']}")
    print(f"Avg labels per issue: {info['avg_labels_per_issue']:.2f}")
    print(f"Labels with no issues: {info['labels_with_no_issues']}")

    # Extract features
    text_columns = get_text_columns(df)
    label_columns = get_label_columns(df)

    feature_names = []

    # Only set for the 'tfidf' branch; needed later to persist the vectorizer.
    vectorizer = None

    if feature_type == "tfidf":
        features, vectorizer = extract_tfidf_features(df, text_columns=text_columns)
        # NOTE(review): get_feature_names_out() returns an ndarray, not a
        # list — callers treating feature_names as a list still work for
        # len()/iteration, but not for list methods.
        feature_names = vectorizer.get_feature_names_out()
    elif feature_type == "embedding":
        # Embedding dimensions have no vocabulary; use synthetic names.
        features, _ = extract_embedding_features(
            df, text_columns=text_columns, model_name=model_name
        )
        feature_names = [f"emb_{i}" for i in range(features.shape[1])]
    else:
        raise ValueError(f"Unknown feature_type: {feature_type}")

    # Prepare labels (binary multi-label matrix)
    labels = prepare_labels(df, label_columns)

    # Save processed data
    if save_processed:
        # Path: processed/{feature_type}/ — one subdirectory per feature type
        output_dir = PROCESSED_DATA_DIR / feature_type
        output_dir.mkdir(parents=True, exist_ok=True)

        features_path = output_dir / f"features_{feature_type}.npy"
        labels_path = output_dir / f"labels_{feature_type}.npy"

        np.save(features_path, features)
        np.save(labels_path, labels.values)

        print(f"\nSaved processed data to {output_dir}")
        print(f"  - {features_path.name}: {features.shape}")
        print(f"  - {labels_path.name}: {labels.shape}")

        # Save vectorizer and label names to models/ directory for inference
        MODELS_DIR.mkdir(parents=True, exist_ok=True)

        if feature_type == "tfidf" and vectorizer is not None:
            vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
            joblib.dump(vectorizer, vectorizer_path)
            print(f"  - Saved TF-IDF vectorizer to: {vectorizer_path}")

        # Always save label names (needed for both tfidf and embedding inference)
        label_names_path = MODELS_DIR / "label_names.pkl"
        joblib.dump(label_columns, label_names_path)
        print(f"  - Saved {len(label_columns)} label names to: {label_names_path}")

    return features, labels, feature_names, label_columns


def load_processed_data(
    feature_name: str = "tfidf", data_dir: Optional[Path] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load processed features and labels from disk.

    Args:
        feature_name: Name prefix of the features to load (e.g., 'tfidf', 'embedding')
        data_dir: Path to processed data directory. If None, uses the
                  per-feature-type directory that create_feature_dataset()
                  saves to (PROCESSED_DATA_DIR / feature_name).

    Returns:
        Tuple of (features, labels)

    Raises:
        FileNotFoundError: if the .npy files are not present in data_dir.
    """
    if data_dir is None:
        # Match create_feature_dataset(), which saves under
        # processed/<feature_type>/. The previous default pointed at
        # PROCESSED_DATA_DIR itself and therefore never found the files.
        data_dir = PROCESSED_DATA_DIR / feature_name

    features_path = data_dir / f"features_{feature_name}.npy"
    labels_path = data_dir / f"labels_{feature_name}.npy"

    features = np.load(features_path)
    labels = np.load(labels_path)

    print(f"Loaded processed data from {data_dir}")
    print(f"  - Feature type: {feature_name}")
    print(f"  - Features shape: {features.shape}")
    print(f"  - Labels shape: {labels.shape}")

    return features, labels


if __name__ == "__main__":
    # Run the full pipeline end-to-end with embedding features: loads the
    # SQLite dataset, extracts features and labels, and saves artifacts to
    # disk (see create_feature_dataset for the output locations).
    features, labels, feature_names, label_names = create_feature_dataset(feature_type="embedding")

    print("\n=== Feature Extraction Summary ===")
    print(f"Features shape: {features.shape}")
    print(f"Labels shape: {labels.shape}")
    print(f"Number of feature names: {len(feature_names)}")
    print(f"Number of labels: {len(label_names)}")