Update chunking.py (#5)
Browse files
- Update chunking.py (04a10ad976b8d025af1b6047c3a2e92de9f3c74c)
Co-authored-by: Manas Korada <mkorada@users.noreply.huggingface.co>
- chunking.py +9 -1
chunking.py
CHANGED
|
@@ -15,9 +15,17 @@ def get_max_word_length(target_languages: list[str]) -> int:
|
|
| 15 |
"cs": 140,
|
| 16 |
"da": 140,
|
| 17 |
"de": 150,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
}
|
| 19 |
|
| 20 |
-
max_word_length = 700 # Default
|
| 21 |
|
| 22 |
for lang in target_languages:
|
| 23 |
if lang in helsinki_word_limits:
|
|
|
|
| 15 |
"cs": 140,
|
| 16 |
"da": 140,
|
| 17 |
"de": 150,
|
| 18 |
+
"pl": 130, # New Languages Start Here Need To Test
|
| 19 |
+
"mk": 60,
|
| 20 |
+
"es": 50,
|
| 21 |
+
"it": 60,
|
| 22 |
+
"is": 90,
|
| 23 |
+
"tr": 60,
|
| 24 |
+
"sl": 150,
|
| 25 |
+
"no": 50,
|
| 26 |
}
|
| 27 |
|
| 28 |
+
max_word_length = 700 # Default Chunk Size
|
| 29 |
|
| 30 |
for lang in target_languages:
|
| 31 |
if lang in helsinki_word_limits:
|