# Source: CRAYON-tokenizer/src/crayon/core/profiles.py
# Uploaded by Phase-Technologies ("Upload folder using huggingface_hub").
# Commit 708f4a3 (verified).
"""
Crayon Profile Definitions.
Defines the 'Cartridges' available for the tokenizer ecosystem.
"""
from dataclasses import dataclass, field
from typing import List, Tuple, Optional
@dataclass(frozen=True)
class VocabProfile:
    """Immutable description of a single vocabulary profile ("cartridge").

    Instances are frozen: attributes cannot be reassigned after construction.
    Note that ``frozen=True`` does not make the ``sources`` list itself
    immutable, and because it is a list, instances are not hashable.

    Only ``name``, ``target_size`` and ``description`` are required; the
    remaining fields fall back to the defaults shown below.
    """

    # Identifier of the profile.
    name: str
    # Target size for the profile (presumably the vocabulary size in tokens;
    # confirm against the code that consumes profiles).
    target_size: int
    # Human-readable summary of the profile.
    description: str
    # List of (Dataset_Name, Split, [Column_Names])
    # Defaults to a fresh empty list per instance, so profiles with no
    # training sources do not have to pass ``sources=[]`` explicitly.
    sources: List[Tuple[str, str, List[str]]] = field(default_factory=list)
    # Presumably the minimum occurrence count for a token to be kept;
    # verify against the consumer of this field.
    min_frequency: int = 2
    # Version tag of the profile definition.
    version: str = "v1"
# --- The Production Cartridge Menu ---
# Registry of the built-in profiles. Each entry is keyed by the profile's own
# ``name``, so the lookup key and the profile name cannot drift apart.
PROFILES = {
    profile.name: profile
    for profile in (
        VocabProfile(
            name="lite",
            description="Lite profile (tiktoken 50k)",
            target_size=50000,
            sources=[],
            min_frequency=0,
        ),
        VocabProfile(
            name="standard",
            description="Standard profile (tiktoken 50k + tiktoken 200k = 250k)",
            target_size=250000,
            sources=[],
            min_frequency=0,
        ),
    )
}