Davy592's picture
First commit
713632e
"""Project configuration, paths, constants, and logging setup.

This module centralizes project-wide configuration such as directory paths,
dataset settings, seeds, and model registries. On import it also loads
environment variables from a local `.env` file (if present) and, when tqdm is
installed, routes loguru output through `tqdm.write` so that log messages do
not break progress bars.

Key attributes
--------------
- `PROJ_ROOT`: Absolute path to the project root.
- `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`:
  Standard data directories following the Cookiecutter data science layout.
- `MODELS_DIR`: Directory where trained models and artifacts are stored.
- `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures.
- `DATASET_NAME`: Hugging Face dataset identifier for this project.
- `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages, train/test splits,
  and every `<language>_<split>` combination.
- `FEATURE_TYPES`: Supported feature families (e.g., embeddings).
- `LABELS`: Per-language label names in the dataset.
- `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration.
- `SEED`: Global random seed.
- `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to Hugging Face model IDs.
"""
from pathlib import Path
from dotenv import load_dotenv
from loguru import logger
# Populate the process environment from a .env file, when one is found.
load_dotenv()

# Paths
# The project root is the directory one level above the folder holding this file.
PROJ_ROOT = Path(__file__).resolve().parents[1]
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

# Data directories, all nested under <project root>/data.
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")

# Trained models and their artifacts.
MODELS_DIR = PROJ_ROOT.joinpath("models")

# Reports, with generated figures kept in a sub-directory.
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
# Project-specific settings (defined by us, not by the project template).

# Hugging Face dataset identifier.
DATASET_NAME = "NLBSE/nlbse26-code-comment-classification"

# Supported languages and splits, plus every "<language>_<split>" combination
# (languages vary slowest, splits fastest).
LANGUAGES = ["java", "python", "pharo"]
SPLITS = ["train", "test"]
LANGUAGES_SPLITS = [
    "_".join((language, part)) for language in LANGUAGES for part in SPLITS
]

# Supported feature families.
FEATURE_TYPES = ["embeddings"]

# Label names per language. Spelling and casing are kept exactly as written
# here (presumably mirroring the dataset's own label names — do not normalize).
LABELS = dict(
    java=[
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    python=[
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    pharo=[
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
)

# Hyper-parameter tuning configuration.
OPTUNA_CROSS_VALIDATION_FOLDS = 2
OPTUNA_TRIALS = 1

# Global random seed.
SEED = 42

# Local model name -> Hugging Face model ID.
SENTENCE_TRANSFORMER_MODELS = {
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}
# If tqdm is installed, route loguru output through tqdm.write so that log
# lines do not break active progress bars.
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    # tqdm is optional: without it, loguru's handlers are left as they are.
    pass
else:
    try:
        # Handler id 0 is the sink loguru registers by default.
        logger.remove(0)
    except ValueError:
        # The default handler is already gone: logging was reconfigured
        # elsewhere, or this module is being imported again. Leave the
        # handlers untouched rather than crashing on import or stacking a
        # duplicate tqdm sink.
        pass
    else:
        logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)