Spaces:
Running
Running
| import logging | |
| import os | |
| from pathlib import Path | |
| import sys | |
| from dotenv import load_dotenv | |
| from loguru import logger | |
| try: | |
| from logtail import LogtailHandler | |
| except ImportError: | |
| LogtailHandler = None # Logtail not available in this environment | |
# Load environment variables from .env file if it exists
load_dotenv()

# Paths
# Project root: the parent of the directory that contains this file
# (symlinks are resolved first).
PROJ_ROOT = Path(__file__).resolve().parents[1]
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

# Data directories, all located under <PROJ_ROOT>/data.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

# Model directory, directly under the project root.
MODELS_DIR = PROJ_ROOT / "models"

# Report directories; figures live inside the reports directory.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"
# Dataset
# Hugging Face identifier of the dataset.
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"

# Languages covered; each one is a key of LABELS_MAP below.
LANGS = ["java", "python", "pharo"]

# Names of the input and label columns.
INPUT_COLUMN = "combo"
LABEL_COLUMN = "labels"

# Category names per language. The mixed casing (e.g. "summary" for java vs
# "Summary" for python) is kept exactly as written.
# NOTE(review): presumably the casing mirrors the dataset's own label names -
# confirm before normalizing.
LABELS_MAP = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Total number of categories summed over all languages (7 + 5 + 6 = 18).
TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values())
# Score parameters
MAX_AVG_RUNTIME = 5.0  # seconds
MAX_AVG_FLOPS = 5000.0  # GFLOPS

# Training parameters
DEFAULT_BATCH_SIZE = 32

# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
# Local cache for baseline statistics: a hidden directory in the user's home.
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally
# Model configuration mapping
# Maps a short lowercase model key to four string fields:
#   model_name          name of the model
#   exp_name            name of the associated experiment
#   model_class_module  dotted path of the module holding the model class
#   model_class_name    name of the class inside that module
# NOTE(review): presumably the module/class pair is imported dynamically by
# the code that consumes this mapping - verify against the caller.
MODEL_CONFIG = {
    "codeberta": {
        "model_name": "fine-tuned-CodeBERTa",
        "exp_name": "fine-tuned-CodeBERTa",
        "model_class_module": "turing.modeling.models.codeBerta",
        "model_class_name": "CodeBERTa",
    },
    "graphcodebert": {
        "model_name": "GraphCodeBERT",
        "exp_name": "fine-tuned-GraphCodeBERT",
        "model_class_module": "turing.modeling.models.graphCodeBert",
        "model_class_name": "GraphCodeBERTClassifier",
    },
    "tinybert": {
        "model_name": "TinyBERT",
        "exp_name": "fine-tuned-TinyBERT",
        "model_class_module": "turing.modeling.models.tinyBert",
        "model_class_name": "TinyBERTClassifier",
    },
    "randomforest": {
        "model_name": "RandomForest-TfIdf",
        "exp_name": "RandomForest-TfIdf",
        "model_class_module": "turing.modeling.models.randomForestTfIdf",
        "model_class_name": "RandomForestTfIdf",
    },
    "minilm": {
        "model_name": "MiniLM",
        "exp_name": "fine-tuned-MiniLm",
        "model_class_module": "turing.modeling.models.miniLM",
        "model_class_name": "MiniLMModel",
    },
    "deberta": {
        "model_name": "DeBERTa-v3-xsmall-raw",
        "exp_name": "fine-tuned-DeBERTa",
        "model_class_module": "turing.modeling.models.DeBERTa",
        "model_class_name": "DebertaXSmall",
    },
}
# Default number of iterations.
DEFAULT_NUM_ITERATIONS = 20

# Existing model modules
# NOTE(review): "deBERTa" differs in case from the module path
# "turing.modeling.models.DeBERTa" used in MODEL_CONFIG - confirm which
# spelling matches the actual module file before relying on either.
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
    "deBERTa",
]
# If tqdm is installed, configure loguru with tqdm.write
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm

    # Replace handler 0 with a sink that writes through tqdm.write.
    logger.remove(0)
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except (ModuleNotFoundError, ValueError):
    # Deliberate best-effort: tqdm is not installed (ModuleNotFoundError), or
    # handler 0 no longer exists so logger.remove raised ValueError. In both
    # cases the current loguru sinks are left untouched.
    pass
# Set up logging for Better Stack using LogtailHandler.
# The handlers are installed only when LogtailHandler was importable and both
# BETTER_STACK_TOKEN and BETTER_STACK_HOST are set to non-empty values;
# otherwise the standard-library root logger is left as it is.
try:
    # Read each variable once; an unset variable becomes "" and is falsy.
    _better_stack_token = os.getenv("BETTER_STACK_TOKEN", "")
    _better_stack_host = os.getenv("BETTER_STACK_HOST", "")

    if LogtailHandler and _better_stack_token and _better_stack_host:
        better_stack_handler = LogtailHandler(
            source_token=_better_stack_token,
            host=_better_stack_host,
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)

        # Console handler on stdout. Its own level is DEBUG, but the root
        # logger level (INFO) is applied first.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)

        # Only WARNING and above are passed to the Better Stack handler.
        better_stack_handler.setLevel(logging.WARNING)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(better_stack_handler)
        logging.info("LogtailHandler for Better Stack configured successfully.")
except Exception as e:
    # Broad on purpose: a failure to set up remote logging is reported but
    # must not stop this module from being imported.
    logging.error("Failed to configure LogtailHandler: %s", e)