|
|
"""Project configuration, paths, constants, and logging setup. |
|
|
|
|
|
This module centralizes project-wide configuration such as directory paths, |
|
|
dataset settings, seeds, and model registries. It also handles environment |
|
|
variable loading from a local `.env` file (if present) and configures logging |
|
|
to play nicely with progress bars. |
|
|
|
|
|
Key attributes |
|
|
-------------- |
|
|
- `PROJ_ROOT`: Absolute path to the project root. |
|
|
- `DATA_DIR`, `RAW_DATA_DIR`, `INTERIM_DATA_DIR`, `PROCESSED_DATA_DIR`, `EXTERNAL_DATA_DIR`: |
|
|
Standard data directories following the Cookiecutter data science layout. |
|
|
- `MODELS_DIR`: Directory where trained models and artifacts are stored. |
|
|
- `REPORTS_DIR`, `FIGURES_DIR`: Reporting outputs and figures. |
|
|
- `DATASET_NAME`: Hugging Face dataset identifier for this project. |
|
|
- `LANGUAGES`, `SPLITS`, `LANGUAGES_SPLITS`: Supported languages and train/test splits. |
|
|
- `FEATURE_TYPES`: Supported feature families (e.g., embeddings). |
|
|
- `LABELS`: Per-language label names in the dataset. |
|
|
- `OPTUNA_CROSS_VALIDATION_FOLDS`, `OPTUNA_TRIALS`: Tuning configuration. |
|
|
- `SEED`: Global random seed. |
|
|
- `SENTENCE_TRANSFORMER_MODELS`: Mapping of local names to HF model IDs. |
|
|
""" |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
from dotenv import load_dotenv |
|
|
from loguru import logger |
|
|
|
|
|
|
|
|
# Pull variables from a local `.env` file (if present) into the process
# environment before any path/constant below is consumed.
load_dotenv()

# This file sits one directory below the repository root, so the root is
# the parent of this file's parent.
PROJ_ROOT = Path(__file__).resolve().parent.parent
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
|
|
|
|
|
# Standard data directories (Cookiecutter data-science layout).
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")

# Trained models and artifacts live here.
MODELS_DIR = PROJ_ROOT.joinpath("models")

# Reporting outputs; figures are nested under reports.
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
|
|
|
|
|
|
|
|
# Hugging Face dataset identifier for this project.
DATASET_NAME = "NLBSE/nlbse26-code-comment-classification"

# Languages covered by the dataset and the splits each one provides.
LANGUAGES = ["java", "python", "pharo"]
SPLITS = ["train", "test"]

# Every "<language>_<split>" combination, e.g. "java_train", in
# language-major order.
LANGUAGES_SPLITS = [
    f"{language}_{part}" for language in LANGUAGES for part in SPLITS
]

# Feature families supported by the pipeline.
FEATURE_TYPES = ["embeddings"]

# Per-language label names, spelled exactly as they appear in the dataset
# (the inconsistent capitalization is intentional — it mirrors upstream).
LABELS = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Optuna hyperparameter-tuning configuration.
OPTUNA_CROSS_VALIDATION_FOLDS = 2
OPTUNA_TRIALS = 1

# Global random seed for reproducibility.
SEED = 42

# Mapping of short local model names to Hugging Face model IDs.
SENTENCE_TRANSFORMER_MODELS = {
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}
|
|
|
|
|
|
|
|
|
|
|
# Make loguru cooperate with tqdm progress bars: route log output through
# `tqdm.write` so messages don't corrupt an active progress-bar line.
# Skipped entirely when tqdm is not installed.
try:
    from tqdm import tqdm

    # Replace loguru's default stderr handler (id 0). If it was already
    # removed — e.g., logging was reconfigured before this module was
    # imported — `logger.remove(0)` raises ValueError; swallow it so
    # importing this module never fails on a duplicate removal.
    try:
        logger.remove(0)
    except ValueError:
        pass
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except ModuleNotFoundError:
    pass
|
|
|