"""Project-wide configuration.

Centralizes filesystem paths, dataset constants, drift-detection thresholds,
the model registry used to locate/load each classifier, and logging setup
(loguru + optional Better Stack forwarding via Logtail).
"""

import logging
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
from loguru import logger

try:
    from logtail import LogtailHandler
except ImportError:
    LogtailHandler = None  # Logtail not available in this environment

# Load environment variables from .env file if it exists
load_dotenv()

# Paths
PROJ_ROOT = Path(__file__).resolve().parents[1]
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

MODELS_DIR = PROJ_ROOT / "models"

REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Dataset
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
LANGS = ["java", "python", "pharo"]
INPUT_COLUMN = "combo"
LABEL_COLUMN = "labels"
# NOTE(review): the mixed capitalization (and "rational" vs "rationale") looks
# intentional -- these strings presumably must match the dataset's label names
# exactly, so do not normalize them without checking the upstream dataset.
LABELS_MAP = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}
# Total number of label categories across all languages.
TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values())

# Score parameters
MAX_AVG_RUNTIME = 5.0  # seconds
MAX_AVG_FLOPS = 5000.0  # GFLOPS

# Training parameters
DEFAULT_BATCH_SIZE = 32

# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally

# Model configuration mapping: short CLI/config key -> metadata needed to
# import and instantiate the model class (module path + class name) and to
# name its experiment runs.
MODEL_CONFIG = {
    "codeberta": {
        "model_name": "fine-tuned-CodeBERTa",
        "exp_name": "fine-tuned-CodeBERTa",
        "model_class_module": "turing.modeling.models.codeBerta",
        "model_class_name": "CodeBERTa",
    },
    "graphcodebert": {
        "model_name": "GraphCodeBERT",
        "exp_name": "fine-tuned-GraphCodeBERT",
        "model_class_module": "turing.modeling.models.graphCodeBert",
        "model_class_name": "GraphCodeBERTClassifier",
    },
    "tinybert": {
        "model_name": "TinyBERT",
        "exp_name": "fine-tuned-TinyBERT",
        "model_class_module": "turing.modeling.models.tinyBert",
        "model_class_name": "TinyBERTClassifier",
    },
    "randomforest": {
        "model_name": "RandomForest-TfIdf",
        "exp_name": "RandomForest-TfIdf",
        "model_class_module": "turing.modeling.models.randomForestTfIdf",
        "model_class_name": "RandomForestTfIdf",
    },
    "minilm": {
        "model_name": "MiniLM",
        "exp_name": "fine-tuned-MiniLm",
        "model_class_module": "turing.modeling.models.miniLM",
        "model_class_name": "MiniLMModel",
    },
    "deberta": {
        "model_name": "DeBERTa-v3-xsmall-raw",
        "exp_name": "fine-tuned-DeBERTa",
        "model_class_module": "turing.modeling.models.DeBERTa",
        "model_class_name": "DebertaXSmall",
    },
}

DEFAULT_NUM_ITERATIONS = 20

# Existing model modules
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
    "deBERTa",
]

# If tqdm is installed, route loguru output through tqdm.write so log lines
# don't corrupt active progress bars.
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm

    logger.remove(0)  # ValueError if the default handler was already removed
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except (ModuleNotFoundError, ValueError):
    pass

# Set up stdlib logging for Better Stack using LogtailHandler. Only active when
# the logtail package is importable AND both env vars are non-empty; otherwise
# this is a silent no-op.
try:
    _bs_token = os.getenv("BETTER_STACK_TOKEN")
    _bs_host = os.getenv("BETTER_STACK_HOST")
    if LogtailHandler and _bs_token and _bs_host:
        better_stack_handler = LogtailHandler(
            source_token=_bs_token,
            host=_bs_host,
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)

        # Everything goes to stdout; only WARNING+ is forwarded to Better Stack.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)
        better_stack_handler.setLevel(logging.WARNING)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(better_stack_handler)
        logging.info("LogtailHandler for Better Stack configured successfully.")
except Exception as e:
    # Best-effort: a broken log-forwarding setup must not crash the app.
    logging.error(f"Failed to configure LogtailHandler: {e}")