Spaces:
Sleeping
Sleeping
| """ | |
| Feature extraction module for skill classification. | |
| This module provides functions to extract features from the SkillScope dataset, | |
| starting with TF-IDF vectorization of textual data from pull request issues. | |
| Dataset Information (from nlbse_tool_competition_data_by_issue): | |
| - 7,154 issues from 11 Java repositories | |
| - 226 total columns: | |
| - 2 text columns: 'issue text' (title) and 'issue description' (body) | |
| - metadata and other columns containing PR/file/context information | |
| - 217 label columns: domain/subdomain skill labels (142 active labels in this DB) | |
| Label Characteristics: | |
| - Multi-label classification problem | |
| - Average 32.9 labels per issue (median: 31) | |
| - Highly imbalanced: some labels appear in all issues, others in very few | |
| - Top labels: Language, Data Structure, DevOps, Error Handling | |
| """ | |
# Standard library
from pathlib import Path
import re
import sqlite3
from typing import Optional, Tuple

# Third-party
import joblib
import numpy as np
import pandas as pd
# Porter stemmer used by the TF-IDF preprocessing path
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Project configuration (data/model directories)
from hopcroft_skill_classification_tool_competition.config import (
    MODELS_DIR,
    PROCESSED_DATA_DIR,
    RAW_DATA_DIR,
)

# Instantiate the stemmer once at import time so every call to
# clean_github_text reuses the same object.
stemmer = PorterStemmer()
def clean_github_text(text: str, use_stemming: bool = True) -> str:
    """
    Clean GitHub issue text as per the SkillScope paper (Aracena et al. process).

    Strips URLs, HTML tags, markdown code blocks/inline code, and non-ASCII
    characters (emojis), then collapses whitespace. Optionally stems tokens.

    Args:
        text: Raw text from a GitHub issue (may be None/NaN).
        use_stemming: If True, apply Porter stemming (recommended for TF-IDF).
            If False, keep original words (recommended for Embeddings/LLMs).

    Returns:
        Cleaned text string (stemmed if use_stemming=True); "" for missing input.
    """
    if pd.isna(text) or text is None:
        return ""

    cleaned = str(text)
    # Noise-removal passes, applied in the original pipeline order:
    # URLs (http/https/www), HTML tags, fenced code blocks, inline code.
    for pattern in (
        r"http\S+|www\.\S+",
        r"<[^>]+>",
        r"```[\s\S]*?```",
        r"`[^`]*`",
    ):
        cleaned = re.sub(pattern, "", cleaned)

    # Drop emojis and any other non-ASCII characters.
    cleaned = cleaned.encode("ascii", "ignore").decode("ascii")
    # Normalize runs of whitespace to single spaces.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Stemming only for TF-IDF; embedding models need full words.
    if not use_stemming:
        return cleaned
    try:
        return " ".join(stemmer.stem(token) for token in cleaned.split())
    except Exception as e:
        print(f"Warning: Stemming failed for text snippet '{cleaned[:50]}...'. Error: {e}")
        # Fall back to the cleaned (but unstemmed) text on stemmer failure.
        return cleaned.strip()
def get_dataset_info(df: pd.DataFrame) -> dict:
    """
    Summarize the dataset: column breakdown and per-issue/per-label statistics.

    Args:
        df: Input dataframe

    Returns:
        Dictionary of dataset statistics (counts, label distribution summaries).
    """
    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    # Binarize the label counts: any positive value means the label applies.
    binary = (df[label_cols] > 0).astype(int)
    per_issue = binary.sum(axis=1)
    per_label = binary.sum(axis=0)

    return {
        "total_issues": len(df),
        "total_columns": len(df.columns),
        "text_columns": text_cols,
        "num_text_columns": len(text_cols),
        "label_columns": label_cols,
        "num_labels": len(label_cols),
        "avg_labels_per_issue": per_issue.mean(),
        "median_labels_per_issue": per_issue.median(),
        "max_labels_per_issue": per_issue.max(),
        "min_labels_per_issue": per_issue.min(),
        "avg_issues_per_label": per_label.mean(),
        "labels_with_no_issues": (per_label == 0).sum(),
    }
def load_data_from_db(db_path: Optional[Path] = None) -> pd.DataFrame:
    """
    Load data from the SQLite database.

    Args:
        db_path: Path to the SQLite database file.
            If None, uses default path in data/raw/skillscope_data.db

    Returns:
        DataFrame containing the nlbse_tool_competition_data_by_issue table
    """
    if db_path is None:
        db_path = RAW_DATA_DIR / "skillscope_data.db"

    conn = sqlite3.connect(db_path)
    try:
        # Load the main table
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue"
        df = pd.read_sql_query(query, conn)
    finally:
        # Ensure the connection is released even if the query fails
        # (the original leaked the handle on a read error).
        conn.close()
    print(f"Loaded {len(df)} records from database")
    return df
def get_text_columns(df: pd.DataFrame) -> list:
    """
    Identify text columns in the dataframe (typically issue title, body, etc.).

    Args:
        df: Input dataframe

    Returns:
        List of column names containing textual data, in canonical order.
    """
    # Known text columns from the SkillScope database schema:
    # 'issue text' (title) and 'issue description' (body).
    candidates = ("issue text", "issue description")
    present = set(df.columns)
    return [name for name in candidates if name in present]
def get_label_columns(df: pd.DataFrame) -> list:
    """
    Identify label columns (domains/subdomains with API counts).

    A label column is any numeric column that is not a known metadata column.

    Args:
        df: Input dataframe

    Returns:
        List of column names containing labels, in dataframe column order.
    """
    # Robust numeric check across dtype representations.
    from pandas.api.types import is_numeric_dtype

    # Known metadata columns that must never be treated as skill labels.
    non_labels = {
        "Repo Name",
        "PR #",
        "issue text",
        "issue description",
        "created_at",
        "author_name",
    }
    return [
        name
        for name in df.columns
        if name not in non_labels and is_numeric_dtype(df[name])
    ]
def combine_text_fields(
    df: pd.DataFrame, text_columns: list, use_stemming: bool = True
) -> pd.Series:
    """
    Merge the given text columns into one cleaned string per row.

    Each cell is cleaned via clean_github_text (SkillScope-style cleaning)
    before the cells of a row are joined with single spaces.

    Args:
        df: Input dataframe
        text_columns: Column names to merge
        use_stemming: Forwarded to clean_github_text — True for TF-IDF,
            False when the text feeds an embedding model.

    Returns:
        Series of cleaned, space-joined text, one entry per row.
    """
    prepared = df[text_columns].fillna("").astype(str)

    def _merge_row(row: pd.Series) -> str:
        # Row values iterate in column order, matching text_columns.
        return " ".join(
            clean_github_text(cell, use_stemming=use_stemming) for cell in row
        )

    return prepared.apply(_merge_row, axis=1)
def extract_tfidf_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    max_features: Optional[int] = 2000,
    min_df: int = 2,
    max_df: float = 0.95,
    ngram_range: Tuple[int, int] = (1, 2),
) -> Tuple[np.ndarray, TfidfVectorizer]:
    """
    Extract TF-IDF features from textual data.

    Args:
        df: Input dataframe
        text_columns: List of text columns to use. If None, auto-detect.
        max_features: Maximum number of features (default 2000 for balanced sparsity).
        min_df: Minimum document frequency for a term to be included.
        max_df: Maximum document frequency (ignore terms in > max_df of docs).
        ngram_range: N-gram range, e.g. (1, 2) for unigrams and bigrams.

    Returns:
        Tuple of (dense feature matrix, fitted vectorizer).

    Raises:
        ValueError: If no text columns are available.
    """
    if text_columns is None:
        text_columns = get_text_columns(df)
    if not text_columns:
        raise ValueError("No text columns found in dataframe")

    # Stemming is enabled here: TF-IDF benefits from vocabulary collapsing.
    print(f"Combining text from columns: {text_columns}")
    corpus = combine_text_fields(df, text_columns, use_stemming=True)

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        stop_words="english",
        lowercase=True,
        strip_accents="unicode",
    )

    print(
        f"Extracting TF-IDF features with max_features={max_features if max_features else 'All'}, "
        f"ngram_range={ngram_range}"
    )
    matrix = vectorizer.fit_transform(corpus)
    print(
        f"Extracted {matrix.shape[1]} TF-IDF features from {matrix.shape[0]} samples"
    )
    # NOTE: densified on return; fine at this dataset size (~7k docs x 2k terms).
    return matrix.toarray(), vectorizer
def extract_embedding_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 32,
) -> Tuple[np.ndarray, object]:
    """
    Extract LLM embeddings from textual data using Sentence Transformers.

    Args:
        df: Input dataframe
        text_columns: List of text columns to use. If None, auto-detect.
        model_name: Name of the pre-trained model to use
        batch_size: Batch size for encoding

    Returns:
        Tuple of (embedding matrix, loaded model object).

    Raises:
        ImportError: If sentence-transformers is not installed.
        ValueError: If no text columns are available.
    """
    # Imported lazily so the rest of the module works without the package.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        raise ImportError(
            f"sentence-transformers import failed: {e}. Try running: pip install sentence-transformers"
        ) from e

    if text_columns is None:
        text_columns = get_text_columns(df)
    if not text_columns:
        raise ValueError("No text columns found in dataframe")

    # No stemming here: transformer tokenizers expect full words.
    print(f"Combining text from columns: {text_columns}")
    corpus = combine_text_fields(df, text_columns, use_stemming=False)

    print(f"Loading embedding model: {model_name}")
    encoder = SentenceTransformer(model_name)

    print(f"Extracting embeddings for {len(corpus)} samples...")
    vectors = encoder.encode(
        corpus.tolist(),
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    print(f"Extracted embeddings shape: {vectors.shape}")
    return vectors, encoder
def prepare_labels(df: pd.DataFrame, label_columns: Optional[list] = None) -> pd.DataFrame:
    """
    Prepare multi-label binary matrix from label columns.

    Args:
        df: Input dataframe
        label_columns: List of label columns. If None, auto-detect.

    Returns:
        DataFrame with binary labels (1 if label present, 0 otherwise).
    """
    if label_columns is None:
        label_columns = get_label_columns(df)

    # Any count greater than zero marks the label as present.
    labels = df[label_columns].gt(0).astype(int)

    print(f"Prepared {len(label_columns)} labels")
    print(f"Label distribution:\n{labels.sum().describe()}")
    return labels
def create_feature_dataset(
    db_path: Optional[Path] = None,
    save_processed: bool = True,
    feature_type: str = "tfidf",  # 'tfidf' or 'embedding'
    model_name: str = "all-MiniLM-L6-v2",
) -> Tuple[np.ndarray, pd.DataFrame, list, list]:
    """
    Main function to create the complete feature dataset.

    Pipeline: load from SQLite -> print dataset summary -> extract features
    (TF-IDF or sentence embeddings) -> binarize labels -> optionally persist
    features/labels under PROCESSED_DATA_DIR/{feature_type}/ and inference
    artifacts (vectorizer, label names) under MODELS_DIR.

    Args:
        db_path: Path to SQLite database
        save_processed: Whether to save processed data to disk
        feature_type: Type of features to extract ('tfidf' or 'embedding')
        model_name: Model name for embeddings (ignored if feature_type='tfidf')

    Returns:
        Tuple of (features, labels, feature_names, label_names)

    Raises:
        ValueError: If feature_type is neither 'tfidf' nor 'embedding'.
    """
    # Load data
    df = load_data_from_db(db_path)
    # Get dataset info
    info = get_dataset_info(df)
    print("\n=== Dataset Information ===")
    print(f"Total issues: {info['total_issues']:,}")
    print(f"Text columns: {info['text_columns']}")
    print(f"Number of labels: {info['num_labels']}")
    print(f"Avg labels per issue: {info['avg_labels_per_issue']:.2f}")
    print(f"Labels with no issues: {info['labels_with_no_issues']}")
    # Extract features
    text_columns = get_text_columns(df)
    label_columns = get_label_columns(df)
    feature_names = []
    # Only the TF-IDF path produces a vectorizer to persist; None otherwise.
    vectorizer = None
    if feature_type == "tfidf":
        features, vectorizer = extract_tfidf_features(df, text_columns=text_columns)
        feature_names = vectorizer.get_feature_names_out()
    elif feature_type == "embedding":
        # The model object is discarded; only the vectors are kept here.
        features, _ = extract_embedding_features(
            df, text_columns=text_columns, model_name=model_name
        )
        # Embedding dimensions have no vocabulary, so use synthetic names.
        feature_names = [f"emb_{i}" for i in range(features.shape[1])]
    else:
        raise ValueError(f"Unknown feature_type: {feature_type}")
    # Prepare labels
    labels = prepare_labels(df, label_columns)
    # Save processed data
    if save_processed:
        # Path: processed/{feature_type}/
        output_dir = PROCESSED_DATA_DIR / feature_type
        output_dir.mkdir(parents=True, exist_ok=True)
        features_path = output_dir / f"features_{feature_type}.npy"
        labels_path = output_dir / f"labels_{feature_type}.npy"
        np.save(features_path, features)
        np.save(labels_path, labels.values)
        print(f"\nSaved processed data to {output_dir}")
        print(f" - {features_path.name}: {features.shape}")
        print(f" - {labels_path.name}: {labels.shape}")
        # Save vectorizer and label names to models/ directory for inference
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        if feature_type == "tfidf" and vectorizer is not None:
            vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
            joblib.dump(vectorizer, vectorizer_path)
            print(f" - Saved TF-IDF vectorizer to: {vectorizer_path}")
        # Always save label names (needed for both tfidf and embedding inference)
        label_names_path = MODELS_DIR / "label_names.pkl"
        joblib.dump(label_columns, label_names_path)
        print(f" - Saved {len(label_columns)} label names to: {label_names_path}")
    return features, labels, feature_names, label_columns
def load_processed_data(
    feature_name: str = "tfidf", data_dir: Optional[Path] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load processed features and labels from disk.

    Args:
        feature_name: Name prefix of the features to load (e.g., 'tfidf', 'bow', 'embeddings')
        data_dir: Path to processed data directory. If None, uses the default
            layout written by create_feature_dataset:
            PROCESSED_DATA_DIR/{feature_name}/

    Returns:
        Tuple of (features, labels)
    """
    if data_dir is None:
        # Fix: create_feature_dataset saves under a per-feature subdirectory
        # (PROCESSED_DATA_DIR / feature_type), so the default must match or
        # the round trip fails with FileNotFoundError.
        data_dir = PROCESSED_DATA_DIR / feature_name
    features_path = data_dir / f"features_{feature_name}.npy"
    labels_path = data_dir / f"labels_{feature_name}.npy"
    features = np.load(features_path)
    labels = np.load(labels_path)
    print(f"Loaded processed data from {data_dir}")
    print(f" - Feature type: {feature_name}")
    print(f" - Features shape: {features.shape}")
    print(f" - Labels shape: {labels.shape}")
    return features, labels
| if __name__ == "__main__": | |
| features, labels, feature_names, label_names = create_feature_dataset(feature_type="embedding") | |
| print("\n=== Feature Extraction Summary ===") | |
| print(f"Features shape: {features.shape}") | |
| print(f"Labels shape: {labels.shape}") | |
| print(f"Number of feature names: {len(feature_names)}") | |
| print(f"Number of labels: {len(label_names)}") | |