Spaces:

harishwar017
/

transliteration_assignment

Sleeping

App Files Files Community

harishwar017 committed on Dec 9, 2025

Commit

bba8b6b

1 Parent(s): c2ab97b

model

Browse files

Files changed (1) hide show

app.py +28 -7

app.py CHANGED Viewed

@@ -207,28 +207,49 @@ def map_punctuation(tok: str) -> str:
     return tok
-def transliterate_sentence(sentence: str, max_word_len: int = 30) -> str:
-    if not sentence.strip():
-        return ""
-    tokens = tokenize_with_punct(sentence)
     out_tokens = []
     for tok in tokens:
         if is_punctuation_token(tok):
             out_tokens.append(map_punctuation(tok))
         else:
-            out_tokens.append(transliterate_word(tok, max_len=max_word_len))
-    # Simple detokenizer: space before words, no space before . , ! ? etc.
     result = ""
     for i, tok in enumerate(out_tokens):
         if i == 0:
             result += tok
         else:
             if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
                 result += tok
             elif result and result[-1] in ["(", "“"]:
                 result += tok
             else:
                 result += " " + tok

     return tok
+def tokenize_with_correct_unicode(text: str):
+    """
+    Split text into word tokens and single-character punctuation tokens.
+
+    A word token is a maximal run of Unicode letters, combining marks
+    (e.g. Devanagari matras and the virama), numbers, or underscores.
+    Every other non-whitespace character becomes its own one-character
+    token, and whitespace is dropped.
+
+    Returns a list of strings in their original order; an empty or
+    whitespace-only input yields an empty list.
+    """
+    # The earlier pattern relied on \p{L}, \p{M} and \p{N}. The standard-library
+    # `re` module does not support \p{...} property escapes and raises re.error
+    # when compiling them, so characters are classified with `unicodedata`.
+    import unicodedata
+
+    tokens = []
+    word_chars = []
+    for ch in text:
+        # General categories beginning with L, M or N are letters, marks and
+        # numbers; "_" stays a word character because \w matched it too.
+        if ch == "_" or unicodedata.category(ch)[0] in "LMN":
+            word_chars.append(ch)
+            continue
+        # Any other character ends the word collected so far.
+        if word_chars:
+            tokens.append("".join(word_chars))
+            word_chars = []
+        # Non-word, non-space characters (punctuation, symbols) stand alone.
+        if not ch.isspace():
+            tokens.append(ch)
+    if word_chars:
+        tokens.append("".join(word_chars))
+    return tokens
+@torch.no_grad()
+def transliterate_sentence(model, sentence: str, src_stoi, tgt_itos, device, max_word_len=30):
+    tokens = tokenize_with_correct_unicode(sentence)
+    print(tokens)
     out_tokens = []
     for tok in tokens:
         if is_punctuation_token(tok):
+            # No change for punctuation
             out_tokens.append(map_punctuation(tok))
         else:
+            # 1. Transliterate the Hindi word
+            translit_word = transliterate_word(model, tok, src_stoi, tgt_itos, max_len=max_word_len)
+            # 2. 🌟 POST-PROCESSING: Clean the word before adding it to the list 🌟
+            # cleaned_word = clean_devanagari_from_roman(translit_word)
+            out_tokens.append(translit_word)
+    # Re-join with simple spacing rules:
+    #   - no space before . , ! ? etc.
+    #   - space elsewhere
     result = ""
     for i, tok in enumerate(out_tokens):
         if i == 0:
             result += tok
         else:
+            # Punctuation check is robust as it uses the cleaned token 'tok'
             if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
+                # no leading space
                 result += tok
             elif result and result[-1] in ["(", "“"]:
+                # no space after opening bracket/quote
                 result += tok
             else:
                 result += " " + tok