File size: 876 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
"""
Crayon Profile Definitions.
Defines the 'Cartridges' available for the tokenizer ecosystem.
"""
from dataclasses import dataclass, field
from typing import List, Tuple, Optional

@dataclass(frozen=True)
class VocabProfile:
    """Immutable record describing one vocabulary profile ("cartridge").

    Instances are frozen: attributes cannot be reassigned after construction.
    Equality compares every field, including ``sources``. Hashing skips
    ``sources`` (a list is unhashable), so instances can be used as dict keys
    and set members while equal instances still hash equally.
    """

    # Identifier of the profile.
    name: str
    # NOTE(review): presumably the desired vocabulary size -- confirm with the
    # code that consumes profiles.
    target_size: int
    # Human-readable summary of the profile.
    description: str
    # List of (Dataset_Name, Split, [Column_Names])
    # Optional: defaults to a fresh empty list per instance. Excluded from the
    # generated __hash__ because lists are unhashable; still part of __eq__.
    sources: List[Tuple[str, str, List[str]]] = field(
        default_factory=list, hash=False
    )
    # NOTE(review): presumably a minimum occurrence threshold -- confirm with
    # the code that consumes profiles.
    min_frequency: int = 2
    version: str = "v1"

# --- The Production Cartridge Menu ---
# Registry of the built-in profiles, keyed by each profile's own ``name``.
# Both entries declare no sources and a min_frequency of 0.
PROFILES = {
    profile.name: profile
    for profile in (
        VocabProfile(
            name="lite",
            description="Lite profile (tiktoken 50k)",
            target_size=50000,
            sources=[],
            min_frequency=0,
        ),
        VocabProfile(
            name="standard",
            description="Standard profile (tiktoken 50k + tiktoken 200k = 250k)",
            target_size=250000,
            sources=[],
            min_frequency=0,
        ),
    )
}