harishwar017 committed on
Commit
bba8b6b
·
1 Parent(s): c2ab97b
Files changed (1) hide show
  1. app.py +28 -7
app.py CHANGED
@@ -207,28 +207,49 @@ def map_punctuation(tok: str) -> str:
207
  return tok
208
 
209
 
210
- def transliterate_sentence(sentence: str, max_word_len: int = 30) -> str:
211
- if not sentence.strip():
212
- return ""
 
 
 
 
 
 
 
213
 
214
- tokens = tokenize_with_punct(sentence)
 
 
 
215
  out_tokens = []
216
 
217
  for tok in tokens:
218
  if is_punctuation_token(tok):
 
219
  out_tokens.append(map_punctuation(tok))
220
  else:
221
- out_tokens.append(transliterate_word(tok, max_len=max_word_len))
222
-
223
- # Simple detokenizer: space before words, no space before . , ! ? etc.
 
 
 
 
 
 
 
224
  result = ""
225
  for i, tok in enumerate(out_tokens):
226
  if i == 0:
227
  result += tok
228
  else:
 
229
  if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
 
230
  result += tok
231
  elif result and result[-1] in ["(", "“"]:
 
232
  result += tok
233
  else:
234
  result += " " + tok
 
207
  return tok
208
 
209
 
210
+ def tokenize_with_correct_unicode(text: str):
211
+ """
212
+ Splits text by matching contiguous word tokens (including Devanagari matras)
213
+
214
+ """
215
+
216
+ # We use a pattern that groups Letters, Marks, and Numbers as a single token.
217
+ # The [a-zA-Z0-9] is redundant if using \p{L}\p{N}, but we keep \w for simplicity
218
+ # and explicitly add \p{M} to capture matras.
219
+ return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
220
 
221
+ @torch.no_grad()
222
+ def transliterate_sentence(model, sentence: str, src_stoi, tgt_itos, device, max_word_len=30):
223
+ tokens = tokenize_with_correct_unicode(sentence)
224
+ print(tokens)
225
  out_tokens = []
226
 
227
  for tok in tokens:
228
  if is_punctuation_token(tok):
229
+ # No change for punctuation
230
  out_tokens.append(map_punctuation(tok))
231
  else:
232
+ # 1. Transliterate the Hindi word
233
+ translit_word = transliterate_word(model, tok, src_stoi, tgt_itos, max_len=max_word_len)
234
+
235
+ # 2. 🌟 POST-PROCESSING: Clean the word before adding it to the list 🌟
236
+ # cleaned_word = clean_devanagari_from_roman(translit_word)
237
+ out_tokens.append(translit_word)
238
+
239
+ # Re-join with simple spacing rules:
240
+ # - no space before . , ! ? etc.
241
+ # - space elsewhere
242
  result = ""
243
  for i, tok in enumerate(out_tokens):
244
  if i == 0:
245
  result += tok
246
  else:
247
+ # Punctuation check is robust as it uses the cleaned token 'tok'
248
  if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
249
+ # no leading space
250
  result += tok
251
  elif result and result[-1] in ["(", "“"]:
252
+ # no space after opening bracket/quote
253
  result += tok
254
  else:
255
  result += " " + tok