Commit ·
bba8b6b
1
Parent(s): c2ab97b
model
Browse files
app.py
CHANGED
|
@@ -207,28 +207,49 @@ def map_punctuation(tok: str) -> str:
|
|
| 207 |
return tok
|
| 208 |
|
| 209 |
|
| 210 |
-
def
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
| 215 |
out_tokens = []
|
| 216 |
|
| 217 |
for tok in tokens:
|
| 218 |
if is_punctuation_token(tok):
|
|
|
|
| 219 |
out_tokens.append(map_punctuation(tok))
|
| 220 |
else:
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
result = ""
|
| 225 |
for i, tok in enumerate(out_tokens):
|
| 226 |
if i == 0:
|
| 227 |
result += tok
|
| 228 |
else:
|
|
|
|
| 229 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
|
|
|
| 230 |
result += tok
|
| 231 |
elif result and result[-1] in ["(", "β"]:
|
|
|
|
| 232 |
result += tok
|
| 233 |
else:
|
| 234 |
result += " " + tok
|
|
|
|
| 207 |
return tok
|
| 208 |
|
| 209 |
|
| 210 |
+
def tokenize_with_correct_unicode(text: str):
    """
    Split text into word tokens and single-character symbol tokens.

    A word token is a maximal run of characters that are Unicode letters
    (category L*), combining marks (M*, which covers Devanagari matras and
    the virama), numbers (N*), or the underscore. Every other
    non-whitespace character becomes its own one-character token, and
    whitespace is dropped.

    The previous implementation expressed this as the regular expression
    ``[\\w\\p{L}\\p{M}\\p{N}]+|\\S``. The standard-library ``re`` module does
    not implement ``\\p{...}`` property classes and raises ``re.error`` when
    compiling that pattern, so the same grouping is done here with
    ``unicodedata`` instead.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of token strings in their original order; an empty list for
        empty or whitespace-only input.
    """
    # Local import keeps this block self-contained; no new file-level
    # dependency is needed.
    import unicodedata

    def _is_word_char(ch: str) -> bool:
        # "_" and alphanumerics mirror what ``\w`` accepts; the category
        # test adds combining marks so matras stay attached to their word.
        return (
            ch == "_"
            or ch.isalnum()
            or unicodedata.category(ch)[0] in "LMN"
        )

    tokens = []
    current = []  # characters of the word token being accumulated
    for ch in text:
        if _is_word_char(ch):
            current.append(ch)
            continue
        # A non-word character ends the word in progress.
        if current:
            tokens.append("".join(current))
            current = []
        # Equivalent of the ``\S`` alternative: keep lone symbols, skip spaces.
        if not ch.isspace():
            tokens.append(ch)
    if current:
        tokens.append("".join(current))
    return tokens
|
| 220 |
|
| 221 |
+
@torch.no_grad()
|
| 222 |
+
def transliterate_sentence(model, sentence: str, src_stoi, tgt_itos, device, max_word_len=30):
|
| 223 |
+
tokens = tokenize_with_correct_unicode(sentence)
|
| 224 |
+
print(tokens)
|
| 225 |
out_tokens = []
|
| 226 |
|
| 227 |
for tok in tokens:
|
| 228 |
if is_punctuation_token(tok):
|
| 229 |
+
# No change for punctuation
|
| 230 |
out_tokens.append(map_punctuation(tok))
|
| 231 |
else:
|
| 232 |
+
# 1. Transliterate the Hindi word
|
| 233 |
+
translit_word = transliterate_word(model, tok, src_stoi, tgt_itos, max_len=max_word_len)
|
| 234 |
+
|
| 235 |
+
# 2. π POST-PROCESSING: Clean the word before adding it to the list π
|
| 236 |
+
# cleaned_word = clean_devanagari_from_roman(translit_word)
|
| 237 |
+
out_tokens.append(translit_word)
|
| 238 |
+
|
| 239 |
+
# Re-join with simple spacing rules:
|
| 240 |
+
# - no space before . , ! ? etc.
|
| 241 |
+
# - space elsewhere
|
| 242 |
result = ""
|
| 243 |
for i, tok in enumerate(out_tokens):
|
| 244 |
if i == 0:
|
| 245 |
result += tok
|
| 246 |
else:
|
| 247 |
+
# Punctuation check is robust as it uses the cleaned token 'tok'
|
| 248 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
| 249 |
+
# no leading space
|
| 250 |
result += tok
|
| 251 |
elif result and result[-1] in ["(", "β"]:
|
| 252 |
+
# no space after opening bracket/quote
|
| 253 |
result += tok
|
| 254 |
else:
|
| 255 |
result += " " + tok
|