Spaces:
Sleeping
Sleeping
| """Configuration and constants for the project""" | |
| from pathlib import Path | |
| # Project paths | |
| PROJECT_DIR = Path(__file__).resolve().parents[1] | |
| DATA_DIR = PROJECT_DIR / "data" | |
| RAW_DATA_DIR = DATA_DIR / "raw" | |
| PROCESSED_DATA_DIR = DATA_DIR / "processed" | |
| MODELS_DIR = PROJECT_DIR / "models" | |
| REPORTS_DIR = PROJECT_DIR / "reports" | |
| # Dataset paths | |
| DB_PATH = RAW_DATA_DIR / "skillscope_data.db" | |
| # Data paths configuration for training | |
| # Updated to use cleaned data (duplicates removed, no data leakage) | |
| # Now pointing to TF-IDF features for API compatibility | |
| DATA_PATHS = { | |
| "features": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), | |
| "labels": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), | |
| "features_original": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), | |
| "labels_original": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), | |
| "models_dir": str(MODELS_DIR), | |
| } | |
| # Embedding configuration | |
| EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" | |
| # API Configuration - which model to use for predictions | |
| API_CONFIG = { | |
| # Model file to load (without path, just filename) | |
| "model_name": "random_forest_tfidf_gridsearch.pkl", | |
| # Feature type: "tfidf" or "embedding" | |
| # This determines how text is transformed before prediction | |
| "feature_type": "tfidf", | |
| } | |
| # Training configuration | |
| TRAINING_CONFIG = { | |
| "random_state": 42, | |
| "test_size": 0.2, | |
| "val_size": 0.1, | |
| "cv_folds": 5, | |
| } | |
| # Model configuration (Random Forest) | |
| MODEL_CONFIG = { | |
| "param_grid": { | |
| "estimator__n_estimators": [50, 100, 200], | |
| "estimator__max_depth": [10, 20, 30], | |
| "estimator__min_samples_split": [2, 5], | |
| } | |
| } | |
| # ADASYN configuration | |
| ADASYN_CONFIG = { | |
| "n_neighbors": 5, | |
| "sampling_strategy": "auto", | |
| } | |
| # PCA configuration | |
| PCA_CONFIG = { | |
| "variance_retained": 0.95, | |
| } | |
| # MLflow configuration | |
| MLFLOW_CONFIG = { | |
| "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow", | |
| "experiments": { | |
| "baseline": "hopcroft_random_forest_baseline", | |
| "smote": "hopcroft_random_forest_smote", | |
| "ros": "hopcroft_random_forest_ros", | |
| "adasyn_pca": "hopcroft_random_forest_adasyn_pca", | |
| "lightgbm": "hopcroft_lightgbm", | |
| "lightgbm_smote": "hopcroft_lightgbm_smote", | |
| }, | |
| } | |
| # Model parameters (legacy - kept for compatibility) | |
| RANDOM_STATE = 42 | |
| TEST_SIZE = 0.2 | |
| VAL_SIZE = 0.1 | |
| # Feature engineering | |
| MAX_TFIDF_FEATURES = 5000 | |
| NGRAM_RANGE = (1, 2) | |
| # Model training (legacy) | |
| N_ESTIMATORS = 100 | |
| MAX_DEPTH = 20 | |
| # Hugging Face dataset | |
| HF_REPO_ID = "NLBSE/SkillCompetition" | |
| HF_FILENAME = "skillscope_data.zip" | |
def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
    """Get data paths for the specified feature type.

    This function allows easy switching between TF-IDF and Embedding features
    for baseline reproduction (TF-IDF) vs improved model (Embeddings).

    Args:
        feature_type: Type of features - 'tfidf' or 'embedding'.
        use_cleaned: If True, use cleaned data (duplicates removed, no leakage).
            If False, use original processed data.

    Returns:
        Dictionary with paths to features, labels, test splits, the models
        directory, and the selected feature type.

    Raises:
        ValueError: If ``feature_type`` is neither 'tfidf' nor 'embedding'.

    Example:
        # For baseline (paper reproduction)
        paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)

        # For improved model
        paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
    """
    if feature_type not in ("tfidf", "embedding"):
        raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")

    feature_dir = PROCESSED_DATA_DIR / feature_type
    # Cleaned artifacts are saved with a "_clean" filename suffix.
    suffix = "_clean" if use_cleaned else ""

    return {
        "features": str(feature_dir / f"features_{feature_type}{suffix}.npy"),
        "labels": str(feature_dir / f"labels_{feature_type}{suffix}.npy"),
        "features_test": str(feature_dir / f"X_test_{feature_type}{suffix}.npy"),
        "labels_test": str(feature_dir / f"Y_test_{feature_type}{suffix}.npy"),
        "models_dir": str(MODELS_DIR),
        "feature_type": feature_type,
    }