# Snapshot of commit b72dbd8 by antofra10: "Port changes from Milestone-5"
"""Configuration and constants for the project"""
from pathlib import Path
# --- Project paths ----------------------------------------------------------
# All paths derive from this file's location: the config module lives one
# level below the project root (hence parents[1]).
PROJECT_DIR: Path = Path(__file__).resolve().parents[1]
DATA_DIR: Path = PROJECT_DIR / "data"
RAW_DATA_DIR: Path = DATA_DIR / "raw"
PROCESSED_DATA_DIR: Path = DATA_DIR / "processed"
MODELS_DIR: Path = PROJECT_DIR / "models"
REPORTS_DIR: Path = PROJECT_DIR / "reports"

# --- Dataset paths ----------------------------------------------------------
# SQLite database holding the raw SkillScope data.
DB_PATH: Path = RAW_DATA_DIR / "skillscope_data.db"

# Data paths configuration for training.
# Updated to use cleaned data (duplicates removed, no data leakage);
# now pointing to TF-IDF features for API compatibility.
# NOTE(review): these filenames lack the "_clean" suffix that
# get_feature_paths(use_cleaned=True) generates — confirm that
# features_tfidf.npy / labels_tfidf.npy really are the cleaned arrays.
DATA_PATHS: dict = {
    "features": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"),
    "labels": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"),
    # "_original" entries currently alias the same files as above.
    "features_original": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"),
    "labels_original": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"),
    "models_dir": str(MODELS_DIR),
}

# --- Embedding configuration ------------------------------------------------
# Name of the model used to produce embedding features (presumably a
# SentenceTransformers checkpoint — verify against the embedding pipeline).
EMBEDDING_MODEL_NAME: str = "all-MiniLM-L6-v2"

# --- API configuration ------------------------------------------------------
# Which model the prediction API uses and how input text is featurized.
API_CONFIG: dict = {
    # Model file to load (without path, just filename).
    "model_name": "random_forest_tfidf_gridsearch.pkl",
    # Feature type: "tfidf" or "embedding".
    # This determines how text is transformed before prediction.
    "feature_type": "tfidf",
}

# --- Training configuration -------------------------------------------------
TRAINING_CONFIG: dict = {
    "random_state": 42,   # global seed for reproducible splits/models
    "test_size": 0.2,     # fraction held out for the test set
    "val_size": 0.1,      # fraction held out for validation
    "cv_folds": 5,        # cross-validation folds
}

# --- Model configuration (Random Forest) ------------------------------------
# Hyperparameter grid for grid search. The "estimator__" prefix suggests the
# forest is wrapped in a meta-estimator (e.g. multi-label wrapper) — confirm
# against the training script.
MODEL_CONFIG: dict = {
    "param_grid": {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [10, 20, 30],
        "estimator__min_samples_split": [2, 5],
    }
}

# --- ADASYN oversampling configuration --------------------------------------
ADASYN_CONFIG: dict = {
    "n_neighbors": 5,
    "sampling_strategy": "auto",
}

# --- PCA configuration ------------------------------------------------------
PCA_CONFIG: dict = {
    # Keep enough components to retain 95% of the variance.
    "variance_retained": 0.95,
}

# --- MLflow configuration ---------------------------------------------------
# Remote tracking server plus the experiment names for each training variant.
MLFLOW_CONFIG: dict = {
    "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow",
    "experiments": {
        "baseline": "hopcroft_random_forest_baseline",
        "smote": "hopcroft_random_forest_smote",
        "ros": "hopcroft_random_forest_ros",
        "adasyn_pca": "hopcroft_random_forest_adasyn_pca",
        "lightgbm": "hopcroft_lightgbm",
        "lightgbm_smote": "hopcroft_lightgbm_smote",
    },
}

# --- Model parameters (legacy - kept for compatibility) ----------------------
# Duplicates of TRAINING_CONFIG values; prefer the dict in new code.
RANDOM_STATE: int = 42
TEST_SIZE: float = 0.2
VAL_SIZE: float = 0.1

# --- Feature engineering -----------------------------------------------------
MAX_TFIDF_FEATURES: int = 5000
NGRAM_RANGE: tuple = (1, 2)  # unigrams and bigrams

# --- Model training (legacy) -------------------------------------------------
N_ESTIMATORS: int = 100
MAX_DEPTH: int = 20

# --- Hugging Face dataset ----------------------------------------------------
HF_REPO_ID: str = "NLBSE/SkillCompetition"
HF_FILENAME: str = "skillscope_data.zip"
def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
    """Build the file-path mapping for the requested feature representation.

    Allows switching between TF-IDF features (baseline / paper reproduction)
    and embedding features (improved model).

    Args:
        feature_type: Feature representation, either 'tfidf' or 'embedding'.
        use_cleaned: When True, point at the "_clean" arrays (duplicates
            removed, no data leakage); when False, use the original
            processed arrays.

    Returns:
        Dict with keys ``features``, ``labels``, ``features_test``,
        ``labels_test``, ``models_dir`` and ``feature_type``; all path
        values are strings.

    Raises:
        ValueError: If ``feature_type`` is not a recognised value.

    Example:
        # For baseline (paper reproduction)
        paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)
        # For improved model
        paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
    """
    if feature_type not in ("tfidf", "embedding"):
        raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")

    # Cleaned arrays carry a "_clean" filename suffix; originals carry none.
    stem = f"{feature_type}_clean" if use_cleaned else feature_type
    base_dir = PROCESSED_DATA_DIR / feature_type

    return {
        "features": str(base_dir / f"features_{stem}.npy"),
        "labels": str(base_dir / f"labels_{stem}.npy"),
        "features_test": str(base_dir / f"X_test_{stem}.npy"),
        "labels_test": str(base_dir / f"Y_test_{stem}.npy"),
        "models_dir": str(MODELS_DIR),
        "feature_type": feature_type,
    }