Spaces:
Running
Running
File size: 4,445 Bytes
0d60ae9 5fc6e5d 0d60ae9 5fc6e5d 5abc469 5fc6e5d 38593e7 5fc6e5d fae8ff7 8e13241 5fc6e5d 8e13241 5fc6e5d 0d60ae9 5abc469 0d60ae9 5abc469 0d60ae9 5abc469 0d60ae9 5abc469 0d60ae9 5abc469 0d60ae9 5abc469 0d60ae9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import logging
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from loguru import logger
# Optional dependency: Better Stack's Logtail log-shipping handler.
# Falls back to None so the setup code at the bottom can no-op cleanly.
try:
    from logtail import LogtailHandler
except ImportError:
    LogtailHandler = None  # Logtail not available in this environment
# Load environment variables from .env file if it exists (no error if absent).
load_dotenv()
# Paths — all derived from the repository root so the project is relocatable.
# PROJ_ROOT is two directory levels above this file.
PROJ_ROOT = Path(__file__).resolve().parents[1]
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")  # emitted at import time
# Data directories follow the cookiecutter-data-science layout.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"
MODELS_DIR = PROJ_ROOT / "models"
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"
# Dataset: Hugging Face id for the NLBSE'26 code-comment-classification corpus.
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
LANGS = ["java", "python", "pharo"]  # per-language sub-corpora in the dataset
INPUT_COLUMN = "combo"  # column holding the model input text
LABEL_COLUMN = "labels"  # column holding the target labels
# Per-language comment-category names.
# NOTE(review): the mixed casing in the java list ("summary" vs "Ownership",
# and "rational" rather than "rationale") presumably mirrors the dataset's
# exact label strings — confirm against the dataset before normalizing.
LABELS_MAP = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}
# Total number of (language, category) pairs across all three languages.
TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values())
# Score parameters — budget caps used when scoring models (units per comment).
MAX_AVG_RUNTIME = 5.0  # seconds
MAX_AVG_FLOPS = 5000.0  # GFLOPS
# Training parameters
DEFAULT_BATCH_SIZE = 32
# Drift detection parameters — alert threshold is stricter than the warning one.
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally
# Model configuration mapping, keyed by a short CLI-friendly alias.
# Fields per entry:
#   model_name         — display/registry name of the model
#   exp_name           — experiment name (presumably used for tracking runs —
#                        confirm against the training code)
#   model_class_module — dotted module path holding the model class
#   model_class_name   — class name to load from that module (likely via
#                        dynamic import — verify against the caller)
MODEL_CONFIG = {
    "codeberta": {
        "model_name": "fine-tuned-CodeBERTa",
        "exp_name": "fine-tuned-CodeBERTa",
        "model_class_module": "turing.modeling.models.codeBerta",
        "model_class_name": "CodeBERTa",
    },
    "graphcodebert": {
        "model_name": "GraphCodeBERT",
        "exp_name": "fine-tuned-GraphCodeBERT",
        "model_class_module": "turing.modeling.models.graphCodeBert",
        "model_class_name": "GraphCodeBERTClassifier",
    },
    "tinybert": {
        "model_name": "TinyBERT",
        "exp_name": "fine-tuned-TinyBERT",
        "model_class_module": "turing.modeling.models.tinyBert",
        "model_class_name": "TinyBERTClassifier",
    },
    "randomforest": {
        "model_name": "RandomForest-TfIdf",
        "exp_name": "RandomForest-TfIdf",
        "model_class_module": "turing.modeling.models.randomForestTfIdf",
        "model_class_name": "RandomForestTfIdf",
    },
    "minilm": {
        "model_name": "MiniLM",
        "exp_name": "fine-tuned-MiniLm",
        "model_class_module": "turing.modeling.models.miniLM",
        "model_class_name": "MiniLMModel",
    },
    "deberta": {
        "model_name": "DeBERTa-v3-xsmall-raw",
        "exp_name": "fine-tuned-DeBERTa",
        "model_class_module": "turing.modeling.models.DeBERTa",
        "model_class_name": "DebertaXSmall",
    },
}
DEFAULT_NUM_ITERATIONS = 20  # presumably benchmark/evaluation repetitions — confirm usage
# Existing model modules (file names under turing/modeling/models).
# NOTE(review): this lists only a subset of MODEL_CONFIG's modules, and
# "deBERTa" differs in casing from MODEL_CONFIG's "...models.DeBERTa" —
# verify which spelling matches the actual module file before relying on it.
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
    "deBERTa",
]
# If tqdm is installed, route loguru output through tqdm.write so log lines
# don't corrupt active progress bars.
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
    # Replace loguru's default sink (handler id 0) with a tqdm-aware one;
    # end="" avoids appending an extra newline to the formatted message.
    logger.remove(0)
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except (ModuleNotFoundError, ValueError):
    # ModuleNotFoundError: tqdm is absent. ValueError: handler 0 was already
    # removed (e.g. on re-import) — in either case keep the current sinks.
    pass
# Setup logging for Better Stack using LogtailHandler. Active only when the
# optional import succeeded and both env vars are set; any failure is logged
# and swallowed so module import never breaks on logging configuration.
try:
    token = os.getenv("BETTER_STACK_TOKEN")
    host = os.getenv("BETTER_STACK_HOST")
    if LogtailHandler and token and host:
        better_stack_handler = LogtailHandler(source_token=token, host=host)
        console_handler = logging.StreamHandler(sys.stdout)
        # Everything reaches the console; only WARNING and above go remote.
        console_handler.setLevel(logging.DEBUG)
        better_stack_handler.setLevel(logging.WARNING)
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        root_logger.addHandler(console_handler)
        root_logger.addHandler(better_stack_handler)
        logging.info("LogtailHandler for Better Stack configured successfully.")
except Exception as e:
    # Best-effort setup: record the failure and continue without Better Stack.
    logging.error(f"Failed to configure LogtailHandler: {e}")