"""Configuration and constants for the project""" from pathlib import Path # Project paths PROJECT_DIR = Path(__file__).resolve().parents[1] DATA_DIR = PROJECT_DIR / "data" RAW_DATA_DIR = DATA_DIR / "raw" PROCESSED_DATA_DIR = DATA_DIR / "processed" MODELS_DIR = PROJECT_DIR / "models" REPORTS_DIR = PROJECT_DIR / "reports" # Dataset paths DB_PATH = RAW_DATA_DIR / "skillscope_data.db" # Data paths configuration for training # Updated to use cleaned data (duplicates removed, no data leakage) # Now pointing to TF-IDF features for API compatibility DATA_PATHS = { "features": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), "labels": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), "features_original": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), "labels_original": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), "models_dir": str(MODELS_DIR), } # Embedding configuration EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # API Configuration - which model to use for predictions API_CONFIG = { # Model file to load (without path, just filename) "model_name": "random_forest_tfidf_gridsearch.pkl", # Feature type: "tfidf" or "embedding" # This determines how text is transformed before prediction "feature_type": "tfidf", } # Training configuration TRAINING_CONFIG = { "random_state": 42, "test_size": 0.2, "val_size": 0.1, "cv_folds": 5, } # Model configuration (Random Forest) MODEL_CONFIG = { "param_grid": { "estimator__n_estimators": [50, 100, 200], "estimator__max_depth": [10, 20, 30], "estimator__min_samples_split": [2, 5], } } # ADASYN configuration ADASYN_CONFIG = { "n_neighbors": 5, "sampling_strategy": "auto", } # PCA configuration PCA_CONFIG = { "variance_retained": 0.95, } # MLflow configuration MLFLOW_CONFIG = { "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow", "experiments": { "baseline": "hopcroft_random_forest_baseline", "smote": "hopcroft_random_forest_smote", "ros": "hopcroft_random_forest_ros", "adasyn_pca": "hopcroft_random_forest_adasyn_pca", "lightgbm": "hopcroft_lightgbm", "lightgbm_smote": "hopcroft_lightgbm_smote", }, } # Model parameters (legacy - kept for compatibility) RANDOM_STATE = 42 TEST_SIZE = 0.2 VAL_SIZE = 0.1 # Feature engineering MAX_TFIDF_FEATURES = 5000 NGRAM_RANGE = (1, 2) # Model training (legacy) N_ESTIMATORS = 100 MAX_DEPTH = 20 # Hugging Face dataset HF_REPO_ID = "NLBSE/SkillCompetition" HF_FILENAME = "skillscope_data.zip" def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict: """ Get data paths for specified feature type. This function allows easy switching between TF-IDF and Embedding features for baseline reproduction (TF-IDF) vs improved model (Embeddings). Args: feature_type: Type of features - 'tfidf' or 'embedding' use_cleaned: If True, use cleaned data (duplicates removed, no leakage). If False, use original processed data. Returns: Dictionary with paths to features, labels, and models directory Example: # For baseline (paper reproduction) paths = get_feature_paths(feature_type='tfidf', use_cleaned=True) # For improved model paths = get_feature_paths(feature_type='embedding', use_cleaned=True) """ if feature_type not in ["tfidf", "embedding"]: raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'") feature_dir = PROCESSED_DATA_DIR / feature_type if use_cleaned: suffix = "_clean" else: suffix = "" return { "features": str(feature_dir / f"features_{feature_type}{suffix}.npy"), "labels": str(feature_dir / f"labels_{feature_type}{suffix}.npy"), "features_test": str(feature_dir / f"X_test_{feature_type}{suffix}.npy"), "labels_test": str(feature_dir / f"Y_test_{feature_type}{suffix}.npy"), "models_dir": str(MODELS_DIR), "feature_type": feature_type, }