Executor-Tyrant-Framework committed on
Commit
8e2ec90
·
verified ·
1 Parent(s): 6c53a1d

Sync from GitHub: 0c5737d7108460e1c2b09e575eccf57cc50766be

Browse files
Files changed (1) hide show
  1. grammars/concepts.gbnf +10 -1
grammars/concepts.gbnf CHANGED
@@ -28,5 +28,14 @@
28
 
29
  root ::= item ("," ws item){0,7}
30
  item ::= word (ws word){0,3}
31
- word ::= [a-zA-Z] [a-zA-Z0-9-]{2,19}
 
 
 
 
 
 
 
 
 
32
  ws ::= " "
 
28
 
29
  root ::= item ("," ws item){0,7}
30
  item ::= word (ws word){0,3}
31
+ # Word: first char is any letter (allows TitleCase proper nouns
32
+ # like "Calvin", but not all-caps acronyms such as "RSA"); body is lowercase + digits + hyphen only.
33
+ # Mid-word capitals are forbidden. This forces Falcon3's BPE
34
+ # tokenizer to emit space-prefixed continuation tokens (" Sunlight")
35
+ # rather than jamming tokens together as one camelCase "word"
36
+ # ("thatGreenhouseCarbon"). Per arXiv 2502.14969, space-prefixed
37
+ # tokens have better-trained embeddings (5-10% gain), which is
38
+ # particularly important for smaller/quantized models like
39
+ # Falcon3-10B-1.58bit.
40
+ word ::= [a-zA-Z] [a-z0-9-]{2,19}
41
  ws ::= " "