File size: 4,445 Bytes
0d60ae9
 
5fc6e5d
0d60ae9
5fc6e5d
 
 
 
5abc469
 
 
 
 
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38593e7
 
 
 
 
 
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fae8ff7
 
 
 
 
 
8e13241
 
 
 
 
 
5fc6e5d
 
 
 
 
 
 
8e13241
5fc6e5d
 
 
 
 
 
 
 
 
 
 
0d60ae9
 
 
 
5abc469
 
 
 
 
0d60ae9
5abc469
 
0d60ae9
5abc469
 
0d60ae9
5abc469
0d60ae9
5abc469
 
0d60ae9
5abc469
0d60ae9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import logging
import os
from pathlib import Path
import sys

from dotenv import load_dotenv
from loguru import logger

# Better Stack's Logtail handler is optional: fall back to None so the
# logging setup at the bottom of this module can detect its absence and
# skip the remote-logging configuration instead of crashing at import.
try:
    from logtail import LogtailHandler
except ImportError:
    LogtailHandler = None  # Logtail not available in this environment

# Load environment variables from .env file if it exists
load_dotenv()

# Paths
# Repository root: two directory levels above this file
# (…/<repo>/<package>/config.py -> <repo>).
PROJ_ROOT = Path(__file__).resolve().parents[1]
# NOTE: emitted at import time, so every process that imports this module
# logs its resolved root once on start-up.
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

# Data directory layout — presumably the Cookiecutter Data Science
# convention (raw/interim/processed/external); verify against repo docs.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

# Serialized model artifacts.
MODELS_DIR = PROJ_ROOT / "models"

# Generated reports and their figures.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Dataset
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
LANGS = ["java", "python", "pharo"]
INPUT_COLUMN = "combo"
LABEL_COLUMN = "labels"

# Per-language comment-category names, in the order used for label vectors.
# NOTE(review): the mixed casing ("summary" vs "Ownership") and the spelling
# "rational" appear to mirror the upstream dataset's own label names — do not
# normalize them without checking the dataset schema first.
LABELS_MAP = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": [
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Total number of categories across all three languages.
TOTAL_CATEGORIES = sum(map(len, LABELS_MAP.values()))

# Score parameters — budget limits used when scoring submissions.
MAX_AVG_RUNTIME = 5.0  # seconds; maximum allowed average inference runtime
MAX_AVG_FLOPS = 5000.0  # GFLOPS; maximum allowed average compute cost

# Training parameters
DEFAULT_BATCH_SIZE = 32

# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
# Per-user cache directory for baseline statistics (created under $HOME).
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally

def _cfg(model_name: str, exp_name: str, module: str, cls: str) -> dict:
    """Assemble one MODEL_CONFIG entry from its four metadata fields."""
    return {
        "model_name": model_name,
        "exp_name": exp_name,
        "model_class_module": module,
        "model_class_name": cls,
    }


# Model configuration mapping: CLI/model key -> loading + experiment metadata.
# "model_class_module" / "model_class_name" are used to import the class
# dynamically; "exp_name" names the tracked experiment run.
MODEL_CONFIG = {
    "codeberta": _cfg(
        "fine-tuned-CodeBERTa",
        "fine-tuned-CodeBERTa",
        "turing.modeling.models.codeBerta",
        "CodeBERTa",
    ),
    "graphcodebert": _cfg(
        "GraphCodeBERT",
        "fine-tuned-GraphCodeBERT",
        "turing.modeling.models.graphCodeBert",
        "GraphCodeBERTClassifier",
    ),
    "tinybert": _cfg(
        "TinyBERT",
        "fine-tuned-TinyBERT",
        "turing.modeling.models.tinyBert",
        "TinyBERTClassifier",
    ),
    "randomforest": _cfg(
        "RandomForest-TfIdf",
        "RandomForest-TfIdf",
        "turing.modeling.models.randomForestTfIdf",
        "RandomForestTfIdf",
    ),
    "minilm": _cfg(
        "MiniLM",
        "fine-tuned-MiniLm",
        "turing.modeling.models.miniLM",
        "MiniLMModel",
    ),
    "deberta": _cfg(
        "DeBERTa-v3-xsmall-raw",
        "fine-tuned-DeBERTa",
        "turing.modeling.models.DeBERTa",
        "DebertaXSmall",
    ),
}
DEFAULT_NUM_ITERATIONS = 20

# Existing model modules
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
    "deBERTa",
]

# If tqdm is installed, configure loguru with tqdm.write
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    pass  # tqdm absent: keep loguru's default stderr sink
else:
    try:
        # Replace loguru's default sink (id 0) with one that routes through
        # tqdm.write so log lines don't corrupt active progress bars.
        logger.remove(0)
        logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
    except ValueError:
        pass  # sink 0 was already removed elsewhere


# setup logging for Better Stack using LogtailHandler
# Ships WARNING+ records to Better Stack while mirroring records to stdout.
# Fix: each env var is now read exactly once — the previous code re-read
# both variables with "" defaults after already checking their truthiness,
# which was redundant (the defaults were dead code given the guard).
try:
    _bs_token = os.getenv("BETTER_STACK_TOKEN")
    _bs_host = os.getenv("BETTER_STACK_HOST")
    if LogtailHandler and _bs_token and _bs_host:
        better_stack_handler = LogtailHandler(
            source_token=_bs_token,
            host=_bs_host,
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)

        # NOTE(review): DEBUG on the handler is ineffective while the root
        # logger is set to INFO — DEBUG records are dropped before reaching
        # any handler. Kept as-is; confirm intended verbosity.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)

        # Only WARNING and above are forwarded to Better Stack.
        better_stack_handler.setLevel(logging.WARNING)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(better_stack_handler)

        logging.info("LogtailHandler for Better Stack configured successfully.")

except Exception as e:
    # Logging setup must never crash application start-up: report and continue.
    logging.error(f"Failed to configure LogtailHandler: {e}")