Script that spits out every damned hyphen in Unicode, because Gemma loves them
Browse files
I mean, legitimately **banning the fucking hyphen** is damn near the only way to get Gemma models to not-do-the-thing.
- every_hyphen.py +49 -0
every_hyphen.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
# -*- coding: utf-8; -*-
|
| 2 |
+
|
| 3 |
+
# table sourced from http://jkorpela.fi/dashes.html
|
| 4 |
+
# I know, it has tabs, it's disgusting. Sorry.
|
| 5 |
+
|
| 6 |
+
## - U+002D - hyphen-minus the Ascii hyphen, with multiple usage, or “ambiguous semantic value”; the width should be “average”
|
| 7 |
+
## ~ U+007E ~ tilde the Ascii tilde, with multiple usage; “swung dash”
|
| 8 |
+
## U+00AD ­ soft hyphen “discretionary hyphen”
|
| 9 |
+
## ֊ U+058A ֊ armenian hyphen as soft hyphen, but different in shape
|
| 10 |
+
## ־ U+05BE ־ hebrew punctuation maqaf word hyphen in Hebrew
|
| 11 |
+
## ᐀ U+1400 ᐀ canadian syllabics hyphen used in Canadian Aboriginal Syllabics
|
| 12 |
+
## ᠆ U+1806 ᠆ mongolian todo soft hyphen as soft hyphen, but displayed at the beginning of the second line
|
| 13 |
+
## ‐ U+2010 ‐ hyphen unambiguously a hyphen character, as in “left-to-right”; narrow width
|
| 14 |
+
## ‑ U+2011 ‑ non-breaking hyphen as hyphen (U+2010), but not an allowed line break point
|
| 15 |
+
## ‒ U+2012 ‒ figure dash as hyphen-minus, but has the same width as digits
|
| 16 |
+
## – U+2013 – en dash used e.g. to indicate a range of values
|
| 17 |
+
## — U+2014 — em dash used e.g. to make a break in the flow of a sentence
|
| 18 |
+
## ― U+2015 ― horizontal bar used to introduce quoted text in some typographic styles; “quotation dash”; often (e.g., in the representative glyph in the Unicode standard) longer than em dash
|
| 19 |
+
## ⁓ U+2053 ⁓ swung dash like a large tilde
|
| 20 |
+
## ⁻ U+207B ⁻ superscript minus a compatibility character which is equivalent to minus sign U+2212 in superscript style
|
| 21 |
+
## ₋ U+208B ₋ subscript minus a compatibility character which is equivalent to minus sign U+2212 in subscript style
|
| 22 |
+
## − U+2212 − minus sign an arithmetic operator; the glyph may look the same as the glyph for a hyphen-minus, or may be longer ;
|
| 23 |
+
## ⸗ U+2E17 ⸗ double oblique hyphen used in ancient Near-Eastern linguistics; not in Fraktur, but the glyph of Ascii hyphen or hyphen is similar to this character in Fraktur fonts
|
| 24 |
+
## ⸺ U+2E3A ⸺ two-em dash omission dash, 2 em units wide
|
| 25 |
+
## ⸻ U+2E3B ⸻ three-em dash used in bibliographies, 3 em units wide
|
| 26 |
+
## 〜 U+301C 〜 wave dash a Chinese/Japanese/Korean character
|
| 27 |
+
## 〰 U+3030 〰 wavy dash a Chinese/Japanese/Korean character
|
| 28 |
+
## ゠ U+30A0 ゠ katakana-hiragana double hyphen in Japanese kana writing
|
| 29 |
+
## ︱ U+FE31 ︱ presentation form for vertical em dash vertical variant of em dash
|
| 30 |
+
## ︲ U+FE32 ︲ presentation form for vertical en dash vertical variant of en dash
|
| 31 |
+
## ﹘ U+FE58 ﹘ small em dash small variant of em dash
|
| 32 |
+
## ﹣ U+FE63 ﹣ small hyphen-minus small variant of Ascii hyphen
|
| 33 |
+
## - U+FF0D - fullwidth hyphen-minus variant of Ascii hyphen for use with CJK characters
|
| 34 |
+
|
| 35 |
+
# for i in "\u002D" "\u007E" "\u00AD" "\u058A" "\u05BE" "\u1400" "\u1806" "\u2010" "\u2011" "\u2012" "\u2013" "\u2014" "\u2015" "\u2053" "\u207B" "\u208B" "\u2212" "\u2E17" "\u2E3A" "\u2E3B" "\u301C" "\u3030" "\u30A0" "\uFE31" "\uFE32" "\uFE58" "\uFE63" "\uFF0D":
|
| 36 |
+
# print(i * 72)
|
| 37 |
+
|
| 38 |
+
import os
|
| 39 |
+
import re
|
| 40 |
+
|
| 41 |
+
# Marker that introduces a table row in this file's own comment block.
_ROW_PREFIX = '## '
# How many times each hyphen-like character is repeated in the output.
_REPEAT_COUNT = 18


def _glyph_from_code_point(code):
    """Return the character named by a ``U+XXXX`` string, or None if invalid."""
    if not code.upper().startswith('U+'):
        return None
    try:
        return chr(int(code[2:], 16))
    except (ValueError, OverflowError):
        return None


def format_row(line):
    """Format one tab-separated table row as ``name: <glyph repeated>``.

    A row starts with the table-row marker and carries at least four
    tab-separated fields: marker plus glyph, code point, entity, name.

    Returns the formatted string, or None when ``line`` is not a usable
    table row (wrong prefix, too few fields, or no recoverable glyph).
    """
    if not line.startswith(_ROW_PREFIX):
        return None

    fields = [field.strip() for field in line.split('\t')]
    if len(fields) < 4:
        # Not a complete row (for instance, tabs were replaced by spaces);
        # skip it instead of failing with IndexError.
        return None

    tokens = fields[0].split()
    if len(tokens) > 1:
        glyph = tokens[1]
    else:
        # The literal glyph is missing from the first column (invisible
        # characters such as the soft hyphen are easily stripped by
        # editors), so rebuild it from the code-point column.
        glyph = _glyph_from_code_point(fields[1])
        if glyph is None:
            return None

    return "{:40} {}".format(fields[3] + ':', glyph * _REPEAT_COUNT)


def main():
    """Print one formatted line per table row found in this script's source."""
    # Use the script's real location, not its bare file name, so the script
    # works regardless of the current working directory.
    script_path = os.path.abspath(__file__)
    # The table is UTF-8; do not depend on the locale's default encoding.
    with open(script_path, mode='r', encoding='utf-8') as f:
        for line in f:
            row = format_row(line)
            if row is not None:
                print(row)


if __name__ == '__main__':
    main()
|