|
|
"""Project configuration, paths, constants, and logging setup. |
|
|
|
|
|
This module centralizes project-wide configuration such as directory paths, |
|
|
dataset settings, seeds, and model registries. It also handles environment |
|
|
variable loading from a local `.env` file (if present) and configures logging |
|
|
to play nicely with progress bars. |
|
|
|
|
|
Key attributes |
|
|
-------------- |
|
|
- `PROJ_ROOT`: Absolute path to the project root. |
|
|
- `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`: |
|
|
Standard data directories following the Cookiecutter data science layout. |
|
|
- `MODELS_DIR`: Directory where trained models and artifacts are stored. |
|
|
- `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures. |
|
|
- `DATASET_NAME`: Hugging Face dataset identifier for this project. |
|
|
- `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages and train/test splits. |
|
|
- `FEATURE_TYPES`: Supported feature families (e.g., embeddings). |
|
|
- `LABELS`: Per-language label names in the dataset. |
|
|
- `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration. |
|
|
- `SEED`: Global random seed. |
|
|
- `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to HF model IDs. |
|
|
""" |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
from dotenv import load_dotenv |
|
|
from loguru import logger |
|
|
|
|
|
|
|
|
# Pull variables from a local `.env` file (if present) into the process
# environment before any path/constant below is consumed.
load_dotenv()

# This file sits one directory below the repository root, so the root is
# the parent of this file's parent.
PROJ_ROOT = Path(__file__).resolve().parent.parent
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
|
|
|
|
|
# Standard data directories (Cookiecutter data-science layout).
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")

# Trained models and artifacts live here.
MODELS_DIR = PROJ_ROOT.joinpath("models")

# Reporting outputs; figures are nested under reports.
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
|
|
|
|
|
|
|
|
# Hugging Face dataset identifier for this project.
DATASET_NAME = "NLBSE/nlbse26-code-comment-classification"

# Languages covered by the dataset and the splits each one provides.
LANGUAGES = ["java", "python", "pharo"]
SPLITS = ["train", "test"]

# Every "<language>_<split>" combination, e.g. "java_train", in
# language-major order.
LANGUAGES_SPLITS = [
    f"{language}_{part}" for language in LANGUAGES for part in SPLITS
]

# Feature families supported by the pipeline.
FEATURE_TYPES = ["embeddings"]

# Per-language label names, spelled exactly as they appear in the dataset
# (the inconsistent capitalization is intentional — it mirrors upstream).
LABELS = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Optuna hyperparameter-tuning configuration.
OPTUNA_CROSS_VALIDATION_FOLDS = 2
OPTUNA_TRIALS = 1

# Global random seed for reproducibility.
SEED = 42

# Mapping of short local model names to Hugging Face model IDs.
SENTENCE_TRANSFORMER_MODELS = {
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}
|
|
|
|
|
|
|
|
|
|
|
# Make loguru cooperate with tqdm progress bars: route log output through
# `tqdm.write` so messages don't corrupt an active progress-bar line.
# Skipped entirely when tqdm is not installed.
try:
    from tqdm import tqdm

    # Replace loguru's default stderr handler (id 0). If it was already
    # removed — e.g., logging was reconfigured before this module was
    # imported — `logger.remove(0)` raises ValueError; swallow it so
    # importing this module never fails on a duplicate removal.
    try:
        logger.remove(0)
    except ValueError:
        pass
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except ModuleNotFoundError:
    pass
|
|
|