from pathlib import Path
import re

# Filesystem locations, all resolved relative to the directory holding this file.
ROOT = Path(__file__).parent
DATA_DIR = ROOT / "data"
MODEL_DIR = ROOT / "model"
RAW_DATA_PATH = ROOT / "arxivData.json"

# Numeric settings. NOTE(review): presumably consumed by training / data-split
# code elsewhere in the project -- confirm against the callers.
SEED = 42
BATCH_SIZE = 16
NUM_EPOCHS = 10
VAL_RATIO = 0.1
TEST_RATIO = 0.1
LEARNING_RATE = 1e-3
MAX_LENGTH = 512

# One taxonomy entry per line: "<tag> (<human readable name>)", where the tag
# consists of word characters, dots and hyphens, e.g. "cs.AI (Artificial Intelligence)".
_TAXONOMY_LINE_RE = re.compile(r"^([\w.-]+)\s+\((.+)\)$")


def _load_taxonomy(path):
    """Parse a taxonomy file into a ``{tag: name}`` dictionary.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    that do not match the ``<tag> (<name>)`` format are skipped silently.
    If a tag appears more than once, the last occurrence wins.

    Args:
        path: Location of the taxonomy file (anything ``open`` accepts).

    Returns:
        A dict mapping each tag to its display name.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    tag_names = {}
    # The context manager guarantees the handle is closed; the explicit
    # encoding keeps parsing independent of the platform's locale default.
    with open(path, encoding="utf-8") as taxonomy_file:
        for raw_line in taxonomy_file:
            entry = raw_line.strip()
            if not entry:
                continue
            match = _TAXONOMY_LINE_RE.match(entry)
            if match:
                tag_names[match.group(1)] = match.group(2)
    return tag_names


# Loaded once at import time; importing this module fails if the file is missing.
TAG_NAMES = _load_taxonomy(ROOT / "taxonomy.txt")


def get_tag_name(tag: str) -> str:
    """Return the display name for *tag*, falling back to less specific matches.

    Lookup order:
      1. the exact tag in ``TAG_NAMES``;
      2. the part of the tag before its first ``"."`` (the whole tag when it
         contains no dot);
      3. the tag itself, unchanged, when neither is known.
    """
    if tag in TAG_NAMES:
        return TAG_NAMES[tag]
    # partition() yields the full string when there is no ".", so no separate
    # "is there a dot" check is needed.
    prefix = tag.partition(".")[0]
    return TAG_NAMES.get(prefix, tag)