Sync from GitHub: 0c5737d7108460e1c2b09e575eccf57cc50766be
Browse files- grammars/concepts.gbnf +10 -1
grammars/concepts.gbnf
CHANGED
|
@@ -28,5 +28,14 @@
|
|
| 28 |
|
| 29 |
root ::= item ("," ws item){0,7}
|
| 30 |
item ::= word (ws word){0,3}
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
ws ::= " "
|
|
|
|
| 28 |
|
| 29 |
root ::= item ("," ws item){0,7}
|
| 30 |
item ::= word (ws word){0,3}
|
| 31 |
+
# Word: first char is any letter (allows TitleCase proper nouns
|
| 32 |
+
# like "Calvin", but not all-caps "RSA"); body is lowercase + digits + hyphen only.
|
| 33 |
+
# Mid-word capitals are forbidden. This forces Falcon3's BPE
|
| 34 |
+
# tokenizer to emit space-prefixed continuation tokens (" Sunlight")
|
| 35 |
+
# rather than jamming tokens together as one camelCase "word"
|
| 36 |
+
# ("thatGreenhouseCarbon"). Per arXiv 2502.14969, space-prefixed
|
| 37 |
+
# tokens have better-trained embeddings (5-10% gain), which is
|
| 38 |
+
# particularly important for smaller/quantized models like
|
| 39 |
+
# Falcon3-10B-1.58bit.
|
| 40 |
+
word ::= [a-zA-Z] [a-z0-9-]{2,19}
|
| 41 |
ws ::= " "
|