Commit
·
cb95b8e
1
Parent(s):
bba8b6b
minor change
Browse files
app.py
CHANGED
|
@@ -206,7 +206,6 @@ def map_punctuation(tok: str) -> str:
|
|
| 206 |
return "."
|
| 207 |
return tok
|
| 208 |
|
| 209 |
-
|
| 210 |
def tokenize_with_correct_unicode(text: str):
|
| 211 |
"""
|
| 212 |
Splits text by matching contiguous word tokens (including Devanagari matras)
|
|
@@ -218,38 +217,29 @@ def tokenize_with_correct_unicode(text: str):
|
|
| 218 |
# and explicitly add \p{M} to capture matras.
|
| 219 |
return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
|
| 220 |
|
| 221 |
-
|
| 222 |
-
def transliterate_sentence(
|
|
|
|
|
|
|
|
|
|
| 223 |
tokens = tokenize_with_correct_unicode(sentence)
|
| 224 |
-
print(tokens)
|
| 225 |
out_tokens = []
|
| 226 |
|
| 227 |
for tok in tokens:
|
| 228 |
if is_punctuation_token(tok):
|
| 229 |
-
# No change for punctuation
|
| 230 |
out_tokens.append(map_punctuation(tok))
|
| 231 |
else:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
# 2. π POST-PROCESSING: Clean the word before adding it to the list π
|
| 236 |
-
# cleaned_word = clean_devanagari_from_roman(translit_word)
|
| 237 |
-
out_tokens.append(translit_word)
|
| 238 |
-
|
| 239 |
-
# Re-join with simple spacing rules:
|
| 240 |
-
# - no space before . , ! ? etc.
|
| 241 |
-
# - space elsewhere
|
| 242 |
result = ""
|
| 243 |
for i, tok in enumerate(out_tokens):
|
| 244 |
if i == 0:
|
| 245 |
result += tok
|
| 246 |
else:
|
| 247 |
-
# Punctuation check is robust as it uses the cleaned token 'tok'
|
| 248 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
| 249 |
-
# no leading space
|
| 250 |
result += tok
|
| 251 |
elif result and result[-1] in ["(", "β"]:
|
| 252 |
-
# no space after opening bracket/quote
|
| 253 |
result += tok
|
| 254 |
else:
|
| 255 |
result += " " + tok
|
|
|
|
| 206 |
return "."
|
| 207 |
return tok
|
| 208 |
|
|
|
|
| 209 |
def tokenize_with_correct_unicode(text: str):
|
| 210 |
"""
|
| 211 |
Splits text by matching contiguous word tokens (including Devanagari matras)
|
|
|
|
| 217 |
# and explicitly add \p{M} to capture matras.
|
| 218 |
return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
|
| 219 |
|
| 220 |
+
|
| 221 |
+
def transliterate_sentence(sentence: str, max_word_len: int = 30) -> str:
|
| 222 |
+
if not sentence.strip():
|
| 223 |
+
return ""
|
| 224 |
+
|
| 225 |
tokens = tokenize_with_correct_unicode(sentence)
|
|
|
|
| 226 |
out_tokens = []
|
| 227 |
|
| 228 |
for tok in tokens:
|
| 229 |
if is_punctuation_token(tok):
|
|
|
|
| 230 |
out_tokens.append(map_punctuation(tok))
|
| 231 |
else:
|
| 232 |
+
out_tokens.append(transliterate_word(tok, max_len=max_word_len))
|
| 233 |
+
|
| 234 |
+
# Simple detokenizer: space before words, no space before . , ! ? etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
result = ""
|
| 236 |
for i, tok in enumerate(out_tokens):
|
| 237 |
if i == 0:
|
| 238 |
result += tok
|
| 239 |
else:
|
|
|
|
| 240 |
if tok in [".", ",", "!", "?", ";", ":", ")", "β"]:
|
|
|
|
| 241 |
result += tok
|
| 242 |
elif result and result[-1] in ["(", "β"]:
|
|
|
|
| 243 |
result += tok
|
| 244 |
else:
|
| 245 |
result += " " + tok
|