Spaces:
Sleeping
Sleeping
# Tokenization/label_tokens.py
#
# Tag vocabularies and size limits used when labelling samples.
# Each *_TAGS table maps a lowercase key to the literal bracketed tag string.

# Domain tags
# NOTE(review): "education" maps to "[GEN]", the same string as
# ROUTING_TAGS["general"] -- confirm this collision is intended.
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
# NOTE(review): "method" and "experiment" repeat the TASK_TAGS entries, so a
# sample tagged with the same key as both task and section gets the tag twice.
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
# 327.68M tokens, i.e. 320_000 * MAX_TOKENS.
# NOTE(review): this was previously annotated as "325M tokens", and
# TOKEN_TARGETS["default"] is 325_000_000 -- confirm which figure is intended.
MAX_TOTAL_TOKENS = 327_680_000

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}
def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None,
) -> str:
    """Build the concatenated tag prefix for a sample.

    Tags are emitted in a fixed order -- domain, task, section, routing --
    and joined with no separator, e.g. ``[PHYS][HYP][GEN]`` or
    ``[BIO][MTH][SPEC:Genomics]``.

    Args:
        domain: Key into ``DOMAIN_TAGS``; an unknown key contributes no tag.
        task: Key into ``TASK_TAGS``; ``None`` or an unknown key is skipped.
        section: Key into ``SECTION_TAGS``; ``None`` or an unknown key is
            skipped.
        routing: ``"general"`` appends ``ROUTING_TAGS["general"]``;
            ``"specific"`` appends a ``[SPEC...]`` tag; any other value
            appends no routing tag.
        subdomain: Label embedded as ``[SPEC:<subdomain>]`` when ``routing``
            is ``"specific"``. If missing or empty, the bare
            ``ROUTING_TAGS["specific"]`` tag is used instead.

    Returns:
        The tags joined into a single string (empty if nothing matched).
    """
    tags = []

    # The three vocabulary lookups share the same "skip if unknown" rule.
    lookups = (
        (domain, DOMAIN_TAGS),
        (task, TASK_TAGS),
        (section, SECTION_TAGS),
    )
    for key, table in lookups:
        if key in table:
            tags.append(table[key])

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        # A "specific" sample without a subdomain used to lose its routing
        # tag entirely; fall back to the plain [SPEC] tag so it stays routed.
        if subdomain:
            tags.append(f"[SPEC:{subdomain}]")
        else:
            tags.append(ROUTING_TAGS["specific"])

    return "".join(tags)