"""
Crayon Profile Definitions.
Defines the 'Cartridges' available for the tokenizer ecosystem.
"""
from dataclasses import dataclass, field
from typing import List, Tuple, Optional
@dataclass(frozen=True)
class VocabProfile:
    """Immutable description of one vocabulary profile ('cartridge').

    Instances are hashable, so they can be placed in sets or used as dict
    keys. The ``sources`` list is deliberately left out of the generated
    ``__hash__`` (a list is unhashable, so including it would make
    ``hash(profile)`` raise ``TypeError``), but it still takes part in
    ``==`` comparisons.
    """

    # Identifier of the profile.
    name: str
    # Target size of the profile — presumably a vocabulary/token count;
    # confirm against the code that consumes it.
    target_size: int
    # Human-readable summary of the profile.
    description: str
    # List of (Dataset_Name, Split, [Column_Names]).
    # hash=False keeps the field in equality checks but out of __hash__:
    # frozen=True with eq=True hashes every field, and a list cannot be hashed.
    sources: List[Tuple[str, str, List[str]]] = field(hash=False)
    # Frequency threshold; how it is applied is decided by the consumer.
    min_frequency: int = 2
    # Version tag of the profile definition.
    version: str = "v1"
# --- The Production Cartridge Menu ---
# Registry of the built-in profiles. Each entry is stored under its own
# ``name``, so the lookup key and the profile's name always agree.
PROFILES = {
    profile.name: profile
    for profile in (
        VocabProfile(
            name="lite",
            description="Lite profile (tiktoken 50k)",
            target_size=50000,
            sources=[],
            min_frequency=0,
        ),
        VocabProfile(
            name="standard",
            description="Standard profile (tiktoken 50k + tiktoken 200k = 250k)",
            target_size=250000,
            sources=[],
            min_frequency=0,
        ),
    )
}
|