File size: 4,445 Bytes
0d60ae9
 
5fc6e5d
0d60ae9
5fc6e5d
 
 
 
5abc469
 
 
 
 
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38593e7
 
 
 
 
 
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fae8ff7
 
 
 
 
 
8e13241
 
 
 
 
 
5fc6e5d
 
 
 
 
 
 
8e13241
5fc6e5d
 
 
 
 
 
 
 
 
 
 
0d60ae9
 
 
 
5abc469
 
 
 
 
0d60ae9
5abc469
 
0d60ae9
5abc469
 
0d60ae9
5abc469
0d60ae9
5abc469
 
0d60ae9
5abc469
0d60ae9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import logging
import os
from pathlib import Path
import sys

from dotenv import load_dotenv
from loguru import logger

# Better Stack's Logtail handler is optional: fall back to None so the
# logging setup at the bottom of this module can detect its absence and
# skip the remote-logging configuration instead of crashing at import.
try:
    from logtail import LogtailHandler
except ImportError:
    LogtailHandler = None  # Logtail not available in this environment

# Load environment variables from .env file if it exists
load_dotenv()

# Paths
# Repository root: two directory levels above this file
# (…/<repo>/<package>/config.py -> <repo>).
PROJ_ROOT = Path(__file__).resolve().parents[1]
# NOTE: emitted at import time, so every process that imports this module
# logs its resolved root once on start-up.
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

# Data directory layout — presumably the Cookiecutter Data Science
# convention (raw/interim/processed/external); verify against repo docs.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

# Serialized model artifacts.
MODELS_DIR = PROJ_ROOT / "models"

# Generated reports and their figures.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Dataset
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
LANGS = ["java", "python", "pharo"]
INPUT_COLUMN = "combo"
LABEL_COLUMN = "labels"

# Per-language comment-category names, in the order used for label vectors.
# NOTE(review): the mixed casing ("summary" vs "Ownership") and the spelling
# "rational" appear to mirror the upstream dataset's own label names — do not
# normalize them without checking the dataset schema first.
LABELS_MAP = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": [
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Total number of categories across all three languages.
TOTAL_CATEGORIES = sum(map(len, LABELS_MAP.values()))

# Score parameters — budget limits used when scoring submissions.
MAX_AVG_RUNTIME = 5.0  # seconds; maximum allowed average inference runtime
MAX_AVG_FLOPS = 5000.0  # GFLOPS; maximum allowed average compute cost

# Training parameters
DEFAULT_BATCH_SIZE = 32

# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
# Per-user cache directory for baseline statistics (created under $HOME).
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally

def _cfg(model_name: str, exp_name: str, module: str, cls: str) -> dict:
    """Assemble one MODEL_CONFIG entry from its four metadata fields."""
    return {
        "model_name": model_name,
        "exp_name": exp_name,
        "model_class_module": module,
        "model_class_name": cls,
    }


# Model configuration mapping: CLI/model key -> loading + experiment metadata.
# "model_class_module" / "model_class_name" are used to import the class
# dynamically; "exp_name" names the tracked experiment run.
MODEL_CONFIG = {
    "codeberta": _cfg(
        "fine-tuned-CodeBERTa",
        "fine-tuned-CodeBERTa",
        "turing.modeling.models.codeBerta",
        "CodeBERTa",
    ),
    "graphcodebert": _cfg(
        "GraphCodeBERT",
        "fine-tuned-GraphCodeBERT",
        "turing.modeling.models.graphCodeBert",
        "GraphCodeBERTClassifier",
    ),
    "tinybert": _cfg(
        "TinyBERT",
        "fine-tuned-TinyBERT",
        "turing.modeling.models.tinyBert",
        "TinyBERTClassifier",
    ),
    "randomforest": _cfg(
        "RandomForest-TfIdf",
        "RandomForest-TfIdf",
        "turing.modeling.models.randomForestTfIdf",
        "RandomForestTfIdf",
    ),
    "minilm": _cfg(
        "MiniLM",
        "fine-tuned-MiniLm",
        "turing.modeling.models.miniLM",
        "MiniLMModel",
    ),
    "deberta": _cfg(
        "DeBERTa-v3-xsmall-raw",
        "fine-tuned-DeBERTa",
        "turing.modeling.models.DeBERTa",
        "DebertaXSmall",
    ),
}
DEFAULT_NUM_ITERATIONS = 20

# Existing model modules
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
    "deBERTa",
]

# If tqdm is installed, configure loguru with tqdm.write
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    pass  # tqdm absent: keep loguru's default stderr sink
else:
    try:
        # Replace loguru's default sink (id 0) with one that routes through
        # tqdm.write so log lines don't corrupt active progress bars.
        logger.remove(0)
        logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
    except ValueError:
        pass  # sink 0 was already removed elsewhere


# setup logging for Better Stack using LogtailHandler
# Ships WARNING+ records to Better Stack while mirroring records to stdout.
# Fix: each env var is now read exactly once — the previous code re-read
# both variables with "" defaults after already checking their truthiness,
# which was redundant (the defaults were dead code given the guard).
try:
    _bs_token = os.getenv("BETTER_STACK_TOKEN")
    _bs_host = os.getenv("BETTER_STACK_HOST")
    if LogtailHandler and _bs_token and _bs_host:
        better_stack_handler = LogtailHandler(
            source_token=_bs_token,
            host=_bs_host,
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)

        # NOTE(review): DEBUG on the handler is ineffective while the root
        # logger is set to INFO — DEBUG records are dropped before reaching
        # any handler. Kept as-is; confirm intended verbosity.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)

        # Only WARNING and above are forwarded to Better Stack.
        better_stack_handler.setLevel(logging.WARNING)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(better_stack_handler)

        logging.info("LogtailHandler for Better Stack configured successfully.")

except Exception as e:
    # Logging setup must never crash application start-up: report and continue.
    logging.error(f"Failed to configure LogtailHandler: {e}")