Upload folder using huggingface_hub
Browse files
- README.md +11 -9
- nedo_turkish_tokenizer/_suffix_expander.py +3 -3
README.md
CHANGED
|
@@ -105,14 +105,16 @@ Input: `"İSTANBUL'da meeting'e katılamadım"`
|
|
| 105 |
|
| 106 |
| token | token_type | morph_pos | notes |
|
| 107 |
|---|---|---|---|
|
| 108 |
-
| `<uppercase_word>` | ROOT | 0 | ALL CAPS marker |
|
| 109 |
| ` istanbul` | ROOT | 0 | lowercased |
|
| 110 |
-
| `
|
| 111 |
-
| `
|
| 112 |
-
| `
|
| 113 |
-
| `
|
| 114 |
-
| `
|
| 115 |
-
| `
|
|
|
|
|
|
|
| 116 |
|
| 117 |
---
|
| 118 |
|
|
@@ -123,8 +125,8 @@ Every token dict contains:
|
|
| 123 |
| Field | Type | Description |
|
| 124 |
|---|---|---|
|
| 125 |
| `token` | `str` | Token string — leading space means word-initial |
|
| 126 |
-
| `token_type` | `str` | Morphological type (
|
| 127 |
-
| `morph_pos` | `int` | Position within word: `0`=root, `1`=1st suffix, `2`=2nd suffix... |
|
| 128 |
|
| 129 |
### Token Types
|
| 130 |
|
|
|
|
| 105 |
|
| 106 |
| token | token_type | morph_pos | notes |
|
| 107 |
|---|---|---|---|
|
| 108 |
+
| `<uppercase_word>` | ROOT | 0 | ALL CAPS marker (Fix 1) |
|
| 109 |
| ` istanbul` | ROOT | 0 | lowercased |
|
| 110 |
+
| `'` | PUNCT | 0 | Fixed boundary |
|
| 111 |
+
| `da` | SUFFIX | 1 | `-LOC` [LOC] |
|
| 112 |
+
| ` meeting` | FOREIGN | 0 | TDK lookup (Fix 7) |
|
| 113 |
+
| `e` | SUFFIX | 1 | `-DAT` [DAT] |
|
| 114 |
+
| ` katılmak` | ROOT | 0 | Root corrected (Fix 4) |
|
| 115 |
+
| `lama` | SUFFIX | 1 | `-VN+NEG` |
|
| 116 |
+
| `d` | SUFFIX | 2 | `-PAST` |
|
| 117 |
+
| `ım` | SUFFIX | 3 | `-1SG` [1SG] |
|
| 118 |
|
| 119 |
---
|
| 120 |
|
|
|
|
| 125 |
| Field | Type | Description |
|
| 126 |
|---|---|---|
|
| 127 |
| `token` | `str` | Token string — leading space means word-initial |
|
| 128 |
+
| `token_type` | `str` | Morphological type (ROOT, SUFFIX, FOREIGN, PUNCT, etc.) |
|
| 129 |
+
| `morph_pos` | `int` | Position within word: `0`=root/initial, `1`=1st suffix, `2`=2nd suffix... |
|
| 130 |
|
| 131 |
### Token Types
|
| 132 |
|
nedo_turkish_tokenizer/_suffix_expander.py
CHANGED
|
@@ -3,9 +3,9 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
PUNCT_CHARS = set(
|
| 6 |
-
'?.,;:!-\u2013\u2014()[]{}"`/\\|@#$%^&*+=<>~
|
| 7 |
-
|
| 8 |
-
|
| 9 |
)
|
| 10 |
_PUNCT_DIGITS = set("0123456789")
|
| 11 |
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
PUNCT_CHARS = set(
|
| 6 |
+
"'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
|
| 7 |
+
"\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
|
| 8 |
+
"\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
|
| 9 |
)
|
| 10 |
_PUNCT_DIGITS = set("0123456789")
|
| 11 |
|