Spaces:
Sleeping
Sleeping
| """Configuration and constants for the project""" | |
| from pathlib import Path | |
| # Project paths | |
| PROJECT_DIR = Path(__file__).resolve().parents[1] | |
| DATA_DIR = PROJECT_DIR / "data" | |
| RAW_DATA_DIR = DATA_DIR / "raw" | |
| PROCESSED_DATA_DIR = DATA_DIR / "processed" | |
| MODELS_DIR = PROJECT_DIR / "models" | |
| REPORTS_DIR = PROJECT_DIR / "reports" | |
| # Dataset paths | |
| DB_PATH = RAW_DATA_DIR / "skillscope_data.db" | |
| # Data paths configuration for training | |
| # Updated to use cleaned data (duplicates removed, no data leakage) | |
| # Now pointing to TF-IDF features for API compatibility | |
| DATA_PATHS = { | |
| "features": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), | |
| "labels": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), | |
| "features_original": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"), | |
| "labels_original": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"), | |
| "models_dir": str(MODELS_DIR), | |
| } | |
| # Embedding configuration | |
| EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" | |
| # API Configuration - which model to use for predictions | |
| API_CONFIG = { | |
| # Model file to load (without path, just filename) | |
| "model_name": "random_forest_tfidf_gridsearch.pkl", | |
| # Feature type: "tfidf" or "embedding" | |
| # This determines how text is transformed before prediction | |
| "feature_type": "tfidf", | |
| } | |
| # Training configuration | |
| TRAINING_CONFIG = { | |
| "random_state": 42, | |
| "test_size": 0.2, | |
| "val_size": 0.1, | |
| "cv_folds": 5, | |
| } | |
| # Model configuration (Random Forest) | |
| MODEL_CONFIG = { | |
| "param_grid": { | |
| "estimator__n_estimators": [50, 100, 200], | |
| "estimator__max_depth": [10, 20, 30], | |
| "estimator__min_samples_split": [2, 5], | |
| } | |
| } | |
| # ADASYN configuration | |
| ADASYN_CONFIG = { | |
| "n_neighbors": 5, | |
| "sampling_strategy": "auto", | |
| } | |
| # PCA configuration | |
| PCA_CONFIG = { | |
| "variance_retained": 0.95, | |
| } | |
| # MLflow configuration | |
| MLFLOW_CONFIG = { | |
| "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow", | |
| "experiments": { | |
| "baseline": "hopcroft_random_forest_baseline", | |
| "smote": "hopcroft_random_forest_smote", | |
| "ros": "hopcroft_random_forest_ros", | |
| "adasyn_pca": "hopcroft_random_forest_adasyn_pca", | |
| "lightgbm": "hopcroft_lightgbm", | |
| "lightgbm_smote": "hopcroft_lightgbm_smote", | |
| }, | |
| } | |
| # Model parameters (legacy - kept for compatibility) | |
| RANDOM_STATE = 42 | |
| TEST_SIZE = 0.2 | |
| VAL_SIZE = 0.1 | |
| # Feature engineering | |
| MAX_TFIDF_FEATURES = 5000 | |
| NGRAM_RANGE = (1, 2) | |
| # Model training (legacy) | |
| N_ESTIMATORS = 100 | |
| MAX_DEPTH = 20 | |
| # Hugging Face dataset | |
| HF_REPO_ID = "NLBSE/SkillCompetition" | |
| HF_FILENAME = "skillscope_data.zip" | |
def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
    """Get data paths for the specified feature type.

    This function allows easy switching between TF-IDF and Embedding features
    for baseline reproduction (TF-IDF) vs improved model (Embeddings).

    Args:
        feature_type: Type of features - 'tfidf' or 'embedding'.
        use_cleaned: If True, use cleaned data (duplicates removed, no leakage).
            If False, use original processed data.

    Returns:
        Dictionary with paths to features, labels, test splits, the models
        directory, and the selected feature type.

    Raises:
        ValueError: If ``feature_type`` is neither 'tfidf' nor 'embedding'.

    Example:
        # For baseline (paper reproduction)
        paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)

        # For improved model
        paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
    """
    if feature_type not in ("tfidf", "embedding"):
        raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")

    feature_dir = PROCESSED_DATA_DIR / feature_type
    # Cleaned artifacts are saved with a "_clean" filename suffix.
    suffix = "_clean" if use_cleaned else ""

    return {
        "features": str(feature_dir / f"features_{feature_type}{suffix}.npy"),
        "labels": str(feature_dir / f"labels_{feature_type}{suffix}.npy"),
        "features_test": str(feature_dir / f"X_test_{feature_type}{suffix}.npy"),
        "labels_test": str(feature_dir / f"Y_test_{feature_type}{suffix}.npy"),
        "models_dir": str(MODELS_DIR),
        "feature_type": feature_type,
    }