File size: 2,781 Bytes
713632e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""Project configuration, paths, constants, and logging setup.

This module centralizes project-wide configuration such as directory paths,
dataset settings, seeds, and model registries. It also handles environment
variable loading from a local `.env` file (if present) and configures logging
to play nicely with progress bars.

Key attributes
--------------
- `PROJ_ROOT`: Absolute path to the project root.
- `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`:
    Standard data directories following the Cookiecutter data science layout.
- `MODELS_DIR`: Directory where trained models and artifacts are stored.
- `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures.
- `DATASET_NAME`: Hugging Face dataset identifier for this project.
- `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages and train/test splits.
- `FEATURE_TYPES`: Supported feature families (e.g., embeddings).
- `LABELS`: Per-language label names in the dataset.
- `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration.
- `SEED`: Global random seed.
- `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to HF model IDs.
"""

from pathlib import Path

from dotenv import load_dotenv
from loguru import logger

# Pull in variables from a local .env file, if one exists (no-op otherwise).
load_dotenv()

# Paths
# Project root is two levels above this file: <root>/<package>/config.py.
_config_file = Path(__file__).resolve()
PROJ_ROOT = _config_file.parents[1]
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

# Data directories following the Cookiecutter data-science layout
# (see module docstring).
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"  # original, immutable source data
INTERIM_DATA_DIR = DATA_DIR / "interim"  # intermediate transformed data
PROCESSED_DATA_DIR = DATA_DIR / "processed"  # final datasets ready for modeling
EXTERNAL_DATA_DIR = DATA_DIR / "external"  # data from third-party sources

# Trained models and related artifacts.
MODELS_DIR = PROJ_ROOT / "models"

# Reporting outputs; figures are nested under reports.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Defined by us
# Hugging Face dataset identifier for this project.
DATASET_NAME = "NLBSE/nlbse26-code-comment-classification"
LANGUAGES = ["java", "python", "pharo"]
SPLITS = ["train", "test"]
# Cross product of language x split: "java_train", "java_test", "python_train", ...
LANGUAGES_SPLITS = [f"{lang}_{split}" for lang in LANGUAGES for split in SPLITS]
FEATURE_TYPES = ["embeddings"]
# Per-language label names in the dataset.
# NOTE(review): casing is inconsistent across languages (e.g. "summary" vs
# "Summary") — presumably these mirror the dataset's column names exactly;
# confirm against the dataset before normalizing.
LABELS = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}
# Hyperparameter-tuning configuration (see module docstring).
OPTUNA_CROSS_VALIDATION_FOLDS = 2  # folds per trial
OPTUNA_TRIALS = 1  # number of Optuna trials
# Global random seed for reproducibility.
SEED = 42
# Mapping of local model names to Hugging Face model IDs.
SENTENCE_TRANSFORMER_MODELS = {
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}

# If tqdm is installed, route loguru output through tqdm.write so that log
# lines do not garble active progress bars.
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm

    try:
        # Handler id 0 is loguru's default stderr sink. It may already be
        # gone (e.g. this module imported twice under different names, or a
        # caller reconfigured logging first); remove(0) raises ValueError in
        # that case, which previously crashed the import.
        logger.remove(0)
    except ValueError:
        pass
    # end="" because loguru messages already carry a trailing newline.
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except ModuleNotFoundError:
    # tqdm not installed: keep loguru's default handler untouched.
    pass