# Spaces: DaCrow13 / Hopcroft-Skill-Classification (Running)
# File size: 4,144 Bytes

"""Configuration and constants for the project"""

from pathlib import Path

# Directory layout, anchored at the directory two levels above this file.
PROJECT_DIR = Path(__file__).resolve().parents[1]
DATA_DIR = PROJECT_DIR.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
MODELS_DIR = PROJECT_DIR.joinpath("models")
REPORTS_DIR = PROJECT_DIR.joinpath("reports")

# Raw dataset database file
DB_PATH = RAW_DATA_DIR.joinpath("skillscope_data.db")

# Default data locations used for training (plain strings, not Path objects).
# All feature/label entries point at the TF-IDF arrays for API compatibility;
# the "*_original" keys currently resolve to the same files as the primary keys.
# NOTE(review): an earlier comment described these as the cleaned data
# (duplicates removed, no data leakage), but the file names carry no "_clean"
# suffix, unlike the paths built by get_feature_paths(use_cleaned=True) --
# confirm which files are intended.
DATA_PATHS = dict(
    features=str(PROCESSED_DATA_DIR.joinpath("tfidf", "features_tfidf.npy")),
    labels=str(PROCESSED_DATA_DIR.joinpath("tfidf", "labels_tfidf.npy")),
    features_original=str(PROCESSED_DATA_DIR.joinpath("tfidf", "features_tfidf.npy")),
    labels_original=str(PROCESSED_DATA_DIR.joinpath("tfidf", "labels_tfidf.npy")),
    models_dir=str(MODELS_DIR),
)

# Embedding configuration
# Name of the embedding model (presumably a sentence-transformers model id --
# confirm against the code that loads it).
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# API Configuration - which model to use for predictions
API_CONFIG = {
    # Model file to load (without path, just filename)
    "model_name": "random_forest_tfidf_gridsearch.pkl",
    # Feature type: "tfidf" or "embedding"
    # This determines how text is transformed before prediction
    "feature_type": "tfidf",
}

# Training configuration
# Single source of truth for the split/seed settings; the legacy module-level
# constants further down are derived from this dict.
TRAINING_CONFIG = {
    "random_state": 42,
    "test_size": 0.2,
    "val_size": 0.1,
    "cv_folds": 5,
}

# Model configuration (Random Forest)
# The "estimator__" prefix presumably targets a wrapped estimator in a
# grid search -- confirm against the training script.
MODEL_CONFIG = {
    "param_grid": {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [10, 20, 30],
        "estimator__min_samples_split": [2, 5],
    }
}

# ADASYN configuration
ADASYN_CONFIG = {
    "n_neighbors": 5,
    "sampling_strategy": "auto",
}

# PCA configuration
PCA_CONFIG = {
    "variance_retained": 0.95,
}

# MLflow configuration
# "uri" is the tracking server address; "experiments" maps a short key to the
# experiment name used for each training variant.
MLFLOW_CONFIG = {
    "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow",
    "experiments": {
        "baseline": "hopcroft_random_forest_baseline",
        "smote": "hopcroft_random_forest_smote",
        "ros": "hopcroft_random_forest_ros",
        "adasyn_pca": "hopcroft_random_forest_adasyn_pca",
        "lightgbm": "hopcroft_lightgbm",
        "lightgbm_smote": "hopcroft_lightgbm_smote",
    },
}

# Model parameters (legacy - kept for compatibility)
# Derived from TRAINING_CONFIG so the legacy names cannot drift away from the
# values the training code reads from the dict.
RANDOM_STATE = TRAINING_CONFIG["random_state"]
TEST_SIZE = TRAINING_CONFIG["test_size"]
VAL_SIZE = TRAINING_CONFIG["val_size"]

# Feature engineering
MAX_TFIDF_FEATURES = 5000
NGRAM_RANGE = (1, 2)

# Model training (legacy)
N_ESTIMATORS = 100
MAX_DEPTH = 20

# Hugging Face dataset
HF_REPO_ID = "NLBSE/SkillCompetition"
HF_FILENAME = "skillscope_data.zip"


def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
    """
    Build the file-path mapping for one feature representation.

    Every file lives under ``PROCESSED_DATA_DIR / feature_type`` and is named
    ``<stem>_<feature_type>[_clean].npy``; the ``_clean`` marker is added only
    when ``use_cleaned`` is truthy.

    Args:
        feature_type: Which representation to use - 'tfidf' or 'embedding'.
        use_cleaned: If True, point at the cleaned files ("_clean" suffix).
                     If False, point at the files without the suffix.

    Returns:
        Dictionary with string paths under the keys "features", "labels",
        "features_test", "labels_test" and "models_dir", plus the requested
        "feature_type" itself.

    Raises:
        ValueError: If feature_type is neither 'tfidf' nor 'embedding'.

    Example:
        # TF-IDF features, cleaned files
        paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)

        # Embedding features, cleaned files
        paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
    """
    supported = ("tfidf", "embedding")
    if feature_type not in supported:
        raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")

    # Shared tail of every file name, e.g. "tfidf_clean" or "embedding".
    variant = f"{feature_type}_clean" if use_cleaned else f"{feature_type}"
    base_dir = PROCESSED_DATA_DIR / feature_type

    # (result key, file-name stem) pairs, in the order they appear in the result.
    file_stems = (
        ("features", "features"),
        ("labels", "labels"),
        ("features_test", "X_test"),
        ("labels_test", "Y_test"),
    )
    paths = {key: str(base_dir / f"{stem}_{variant}.npy") for key, stem in file_stems}
    paths.update(models_dir=str(MODELS_DIR), feature_type=feature_type)
    return paths