"""Project configuration, paths, constants, and logging setup. This module centralizes project-wide configuration such as directory paths, dataset settings, seeds, and model registries. It also handles environment variable loading from a local `.env` file (if present) and configures logging to play nicely with progress bars. Key attributes -------------- - `PROJ_ROOT`: Absolute path to the project root. - `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`: Standard data directories following the Cookiecutter data science layout. - `MODELS_DIR`: Directory where trained models and artifacts are stored. - `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures. - `DATASET_NAME`: Hugging Face dataset identifier for this project. - `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages and train/test splits. - `FEATURE_TYPES`: Supported feature families (e.g., embeddings). - `LABELS`: Per-language label names in the dataset. - `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration. - `SEED`: Global random seed. - `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to HF model IDs. """ from pathlib import Path from dotenv import load_dotenv from loguru import logger # Load environment variables from .env file if it exists load_dotenv() # Paths PROJ_ROOT = Path(__file__).resolve().parents[1] logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}") DATA_DIR = PROJ_ROOT / "data" RAW_DATA_DIR = DATA_DIR / "raw" INTERIM_DATA_DIR = DATA_DIR / "interim" PROCESSED_DATA_DIR = DATA_DIR / "processed" EXTERNAL_DATA_DIR = DATA_DIR / "external" MODELS_DIR = PROJ_ROOT / "models" REPORTS_DIR = PROJ_ROOT / "reports" FIGURES_DIR = REPORTS_DIR / "figures" # Defined by us DATASET_NAME = "NLBSE/nlbse26-code-comment-classification" LANGUAGES = ["java", "python", "pharo"] SPLITS = ["train", "test"] LANGUAGES_SPLITS = [f"{lang}_{split}" for lang in LANGUAGES for split in SPLITS] FEATURE_TYPES = ["embeddings"] LABELS = { "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"], "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"], "pharo": [ "Keyimplementationpoints", "Example", "Responsibilities", "Intent", "Keymessages", "Collaborators", ], } OPTUNA_CROSS_VALIDATION_FOLDS = 2 OPTUNA_TRIALS = 1 SEED = 42 SENTENCE_TRANSFORMER_MODELS = { "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2", } # If tqdm is installed, configure loguru with tqdm.write # https://github.com/Delgan/loguru/issues/135 try: from tqdm import tqdm logger.remove(0) logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) except ModuleNotFoundError: pass