| | """Project configuration, paths, constants, and logging setup. |
| | |
| | This module centralizes project-wide configuration such as directory paths, |
| | dataset settings, seeds, and model registries. It also handles environment |
| | variable loading from a local `.env` file (if present) and configures logging |
| | to play nicely with progress bars. |
| | |
| | Key attributes |
| | -------------- |
| | - `PROJ_ROOT`: Absolute path to the project root. |
| | - `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`: |
| | Standard data directories following the Cookiecutter data science layout. |
| | - `MODELS_DIR`: Directory where trained models and artifacts are stored. |
| | - `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures. |
| | - `DATASET_NAME`: Hugging Face dataset identifier for this project. |
| | - `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages and train/test splits. |
| | - `FEATURE_TYPES`: Supported feature families (e.g., embeddings). |
| | - `LABELS`: Per-language label names in the dataset. |
| | - `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration. |
| | - `SEED`: Global random seed. |
| | - `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to HF model IDs. |
| | """ |
| |
|
| | from pathlib import Path |
| |
|
| | from dotenv import load_dotenv |
| | from loguru import logger |
| |
|
| | |
# Pull environment variables from a local `.env` file when one is present;
# a missing file is silently ignored by python-dotenv.
load_dotenv()

# The project root is two levels up from this file (this module lives one
# package directory below the root).
PROJ_ROOT = Path(__file__).resolve().parent.parent
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
| |
|
# Standard data directories following the Cookiecutter data science layout
# (see the module docstring): raw inputs, intermediate artifacts, final
# processed datasets, and third-party/external sources.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

# Trained models and related artifacts are stored here.
MODELS_DIR = PROJ_ROOT / "models"

# Reporting outputs; generated figures live in a dedicated subdirectory.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"
| |
|
| | |
# Hugging Face dataset identifier for this project's code-comment data.
DATASET_NAME = "NLBSE/nlbse26-code-comment-classification"

# Supported programming languages and dataset splits.
LANGUAGES = ["java", "python", "pharo"]
SPLITS = ["train", "test"]

# Every language/split combination, formatted as "<language>_<split>"
# (e.g. "java_train"), in LANGUAGES-major order.
LANGUAGES_SPLITS = [
    f"{language}_{split_name}" for language in LANGUAGES for split_name in SPLITS
]

# Feature families the pipeline knows how to produce.
FEATURE_TYPES = ["embeddings"]

# Per-language label names. NOTE(review): the casing is uneven ("summary"
# vs. "Ownership") — presumably these mirror the dataset's own label
# strings exactly, so do not normalize them.
LABELS = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": [
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Hyper-parameter tuning configuration (Optuna).
OPTUNA_CROSS_VALIDATION_FOLDS = 2
OPTUNA_TRIALS = 1

# Global random seed for reproducibility.
SEED = 42

# Short local model names mapped to their Hugging Face model IDs.
SENTENCE_TRANSFORMER_MODELS = {
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}
| |
|
| | |
| | |
# If tqdm is installed, route loguru output through `tqdm.write` so that log
# lines do not corrupt active progress bars; without tqdm, keep loguru's
# default configuration.
try:
    from tqdm import tqdm

    # Remove loguru's default handler (id 0) and replace it with a sink that
    # writes via tqdm. `end=""` because loguru messages already carry a
    # trailing newline. NOTE(review): `logger.remove(0)` raises ValueError if
    # handler 0 was already removed — assumes this module's setup runs only
    # once per process; confirm if logging is reconfigured elsewhere.
    logger.remove(0)
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except ModuleNotFoundError:
    # tqdm is an optional dependency; silently keep default logging.
    pass
| |
|