|
|
"""Configuration and constants for the project""" |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# --- Filesystem layout -------------------------------------------------------
# Every location below is derived from PROJECT_DIR so the tree can be moved
# as a whole without touching the individual constants.

# Second ancestor of this file (parents[1]): the directory that contains the
# directory this module lives in.
PROJECT_DIR: Path = Path(__file__).resolve().parents[1]

# Data tree: <project>/data/{raw,processed}
DATA_DIR: Path = PROJECT_DIR / "data"
RAW_DATA_DIR: Path = DATA_DIR / "raw"
PROCESSED_DATA_DIR: Path = DATA_DIR / "processed"

# Sibling directories of the data tree.
MODELS_DIR: Path = PROJECT_DIR / "models"
REPORTS_DIR: Path = PROJECT_DIR / "reports"

# Database file, located inside the raw-data directory.
DB_PATH: Path = RAW_DATA_DIR / "skillscope_data.db"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default data locations, stored as plain strings rather than Path objects.
# NOTE(review): the "*_original" entries currently resolve to exactly the same
# TF-IDF files as "features"/"labels" — confirm whether that is intentional.
DATA_PATHS = {
    "features": str(PROCESSED_DATA_DIR.joinpath("tfidf", "features_tfidf.npy")),
    "labels": str(PROCESSED_DATA_DIR.joinpath("tfidf", "labels_tfidf.npy")),
    "features_original": str(PROCESSED_DATA_DIR.joinpath("tfidf", "features_tfidf.npy")),
    "labels_original": str(PROCESSED_DATA_DIR.joinpath("tfidf", "labels_tfidf.npy")),
    "models_dir": str(MODELS_DIR),
}
|
|
|
|
|
|
|
|
# Identifier of the embedding model.
# NOTE(review): presumably a sentence-transformers model id — confirm against
# the code that loads it.
EMBEDDING_MODEL_NAME: str = "all-MiniLM-L6-v2"
|
|
|
|
|
|
|
|
# Settings read by the API layer.
API_CONFIG = {
    # File name of the serialized model.
    # NOTE(review): presumably resolved relative to MODELS_DIR — confirm
    # against the API's model loader.
    "model_name": "random_forest_tfidf_gridsearch.pkl",
    # Feature representation paired with that model; one of the values
    # accepted by get_feature_paths ('tfidf' or 'embedding').
    "feature_type": "tfidf",
}
|
|
|
|
|
|
|
|
# Training settings: seed, split fractions and CV fold count (meanings as
# suggested by the key names).
TRAINING_CONFIG = {
    "random_state": 42,  # same value as the module-level RANDOM_STATE
    "test_size": 0.2,  # same value as the module-level TEST_SIZE
    "val_size": 0.1,  # same value as the module-level VAL_SIZE
    "cv_folds": 5,
}
|
|
|
|
|
|
|
|
# Hyper-parameter search space.
# NOTE(review): the "estimator__" prefix presumably addresses a nested
# estimator inside a wrapper during a scikit-learn-style grid search —
# confirm against the training code.
MODEL_CONFIG = {
    "param_grid": {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [10, 20, 30],
        "estimator__min_samples_split": [2, 5],
    },
}
|
|
|
|
|
|
|
|
# Settings for the ADASYN step.
# NOTE(review): key names look like keyword arguments of an ADASYN
# over-sampler — confirm against the call site.
ADASYN_CONFIG = {
    "n_neighbors": 5,
    "sampling_strategy": "auto",
}
|
|
|
|
|
|
|
|
# Settings for the PCA step.
PCA_CONFIG = {
    # NOTE(review): presumably the fraction of variance to keep (0.95 = 95%)
    # — confirm how the consumer passes this to PCA.
    "variance_retained": 0.95,
}
|
|
|
|
|
|
|
|
# MLflow settings: the server URI plus a lookup table from a short experiment
# key to the full experiment name.
MLFLOW_CONFIG = {
    "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow",
    "experiments": {
        # Random-forest experiments
        "baseline": "hopcroft_random_forest_baseline",
        "smote": "hopcroft_random_forest_smote",
        "ros": "hopcroft_random_forest_ros",
        "adasyn_pca": "hopcroft_random_forest_adasyn_pca",
        # LightGBM experiments
        "lightgbm": "hopcroft_lightgbm",
        "lightgbm_smote": "hopcroft_lightgbm_smote",
    },
}
|
|
|
|
|
|
|
|
# Flat scalar constants; the values match the same-named entries in
# TRAINING_CONFIG ("random_state", "test_size", "val_size").
RANDOM_STATE: int = 42
TEST_SIZE: float = 0.2
VAL_SIZE: float = 0.1
|
|
|
|
|
|
|
|
# TF-IDF settings.
# NOTE(review): presumably forwarded as max_features / ngram_range to the
# TF-IDF vectorizer — confirm against the feature-extraction code.
MAX_TFIDF_FEATURES: int = 5000
NGRAM_RANGE: tuple = (1, 2)
|
|
|
|
|
|
|
|
# Fixed model hyper-parameters; both values also appear among the candidates
# listed in MODEL_CONFIG["param_grid"].
N_ESTIMATORS: int = 100
MAX_DEPTH: int = 20
|
|
|
|
|
|
|
|
# Remote dataset coordinates.
# NOTE(review): the HF_ prefix suggests a Hugging Face Hub repository id and
# the archive file to fetch from it — confirm against the download code.
HF_REPO_ID: str = "NLBSE/SkillCompetition"
HF_FILENAME: str = "skillscope_data.zip"
|
|
|
|
|
|
|
|
def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
    """Build the file-path mapping for one feature representation.

    Lets callers switch between TF-IDF features (baseline reproduction) and
    embedding features (improved model) without hard-coding file names.

    Args:
        feature_type: Feature representation to resolve, either 'tfidf' or
            'embedding'.
        use_cleaned: If True, point at the cleaned files (duplicates removed,
            no leakage), which carry a ``_clean`` suffix. If False, point at
            the original processed files.

    Returns:
        Dictionary of strings with the keys ``features``, ``labels``,
        ``features_test``, ``labels_test`` (paths to ``.npy`` files inside
        ``PROCESSED_DATA_DIR / feature_type``), ``models_dir`` and
        ``feature_type`` (the value that was passed in).

    Raises:
        ValueError: If ``feature_type`` is neither 'tfidf' nor 'embedding'.

    Example:
        # For baseline (paper reproduction)
        paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)

        # For improved model
        paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
    """
    # Validate before touching any path so bad input fails fast.
    supported = ("tfidf", "embedding")
    if feature_type not in supported:
        raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")

    # An empty tag selects the original (non-cleaned) processed files.
    tag = "_clean" if use_cleaned else ""
    stem = f"{feature_type}{tag}"
    base_dir = PROCESSED_DATA_DIR / feature_type

    def _npy(prefix: str) -> str:
        # File names follow the pattern <prefix>_<feature_type><tag>.npy
        return str(base_dir / f"{prefix}_{stem}.npy")

    return {
        "features": _npy("features"),
        "labels": _npy("labels"),
        "features_test": _npy("X_test"),
        "labels_test": _npy("Y_test"),
        "models_dir": str(MODELS_DIR),
        "feature_type": feature_type,
    }
|
|
|