Commit
·
cb95b8e
1
Parent(s):
bba8b6b
minor change
Browse files
app.py
CHANGED
|
@@ -206,7 +206,6 @@ def map_punctuation(tok: str) -> str:
|
|
| 206 |
return "."
|
| 207 |
return tok
|
| 208 |
|
| 209 |
-
|
| 210 |
def tokenize_with_correct_unicode(text: str):
|
| 211 |
"""
|
| 212 |
Splits text by matching contiguous word tokens (including Devanagari matras)
|
|
@@ -218,38 +217,29 @@ def tokenize_with_correct_unicode(text: str):
|
|
| 218 |
# and explicitly add \p{M} to capture matras.
|
| 219 |
return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
|
| 220 |
|
| 221 |
-
|
| 222 |
-
def transliterate_sentence(
|
|
|
|
|
|
|
|
|
|
| 223 |
tokens = tokenize_with_correct_unicode(sentence)
|
| 224 |
-
print(tokens)
|
| 225 |
out_tokens = []
|
| 226 |
|
| 227 |
for tok in tokens:
|
| 228 |
if is_punctuation_token(tok):
|
| 229 |
-
# No change for punctuation
|
| 230 |
out_tokens.append(map_punctuation(tok))
|
| 231 |
else:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
# 2. π POST-PROCESSING: Clean the word before adding it to the list π
|
| 236 |
-
# cleaned_word = clean_devanagari_from_roman(translit_word)
|
| 237 |
-
out_tokens.append(translit_word)
|
| 238 |
-
|
| 239 |
-
# Re-join with simple spacing rules:
|
| 240 |
-
# - no space before . , ! ? etc.
|
| 241 |
-
# - space elsewhere
|
| 242 |
result = ""
|
| 243 |
for i, tok in enumerate(out_tokens):
|
| 244 |
if i == 0:
|
| 245 |
result += tok
|
| 246 |
else:
|
| 247 |
-
# Punctuation check is robust as it uses the cleaned token 'tok'
|
| 248 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
| 249 |
-
# no leading space
|
| 250 |
result += tok
|
| 251 |
elif result and result[-1] in ["(", "β"]:
|
| 252 |
-
# no space after opening bracket/quote
|
| 253 |
result += tok
|
| 254 |
else:
|
| 255 |
result += " " + tok
|
|
|
|
| 206 |
return "."
|
| 207 |
return tok
|
| 208 |
|
|
|
|
| 209 |
def tokenize_with_correct_unicode(text: str):
|
| 210 |
"""
|
| 211 |
Splits text by matching contiguous word tokens (including Devanagari matras)
|
|
|
|
| 217 |
# and explicitly add \p{M} to capture matras.
|
| 218 |
return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
|
| 219 |
|
| 220 |
+
|
| 221 |
+
def transliterate_sentence(sentence: str, max_word_len: int = 30) -> str:
|
| 222 |
+
if not sentence.strip():
|
| 223 |
+
return ""
|
| 224 |
+
|
| 225 |
tokens = tokenize_with_correct_unicode(sentence)
|
|
|
|
| 226 |
out_tokens = []
|
| 227 |
|
| 228 |
for tok in tokens:
|
| 229 |
if is_punctuation_token(tok):
|
|
|
|
| 230 |
out_tokens.append(map_punctuation(tok))
|
| 231 |
else:
|
| 232 |
+
out_tokens.append(transliterate_word(tok, max_len=max_word_len))
|
| 233 |
+
|
| 234 |
+
# Simple detokenizer: space before words, no space before . , ! ? etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
result = ""
|
| 236 |
for i, tok in enumerate(out_tokens):
|
| 237 |
if i == 0:
|
| 238 |
result += tok
|
| 239 |
else:
|
|
|
|
| 240 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
|
|
|
| 241 |
result += tok
|
| 242 |
elif result and result[-1] in ["(", "β"]:
|
|
|
|
| 243 |
result += tok
|
| 244 |
else:
|
| 245 |
result += " " + tok
|