harishwar017 committed on
Commit
cb95b8e
·
1 Parent(s): bba8b6b

minor change

Browse files
Files changed (1) hide show
  1. app.py +8 -18
app.py CHANGED
@@ -206,7 +206,6 @@ def map_punctuation(tok: str) -> str:
206
  return "."
207
  return tok
208
 
209
-
210
  def tokenize_with_correct_unicode(text: str):
211
  """
212
  Splits text by matching contiguous word tokens (including Devanagari matras)
@@ -218,38 +217,29 @@ def tokenize_with_correct_unicode(text: str):
218
  # and explicitly add \p{M} to capture matras.
219
  return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
220
 
221
- @torch.no_grad()
222
- def transliterate_sentence(model, sentence: str, src_stoi, tgt_itos, device, max_word_len=30):
 
 
 
223
  tokens = tokenize_with_correct_unicode(sentence)
224
- print(tokens)
225
  out_tokens = []
226
 
227
  for tok in tokens:
228
  if is_punctuation_token(tok):
229
- # No change for punctuation
230
  out_tokens.append(map_punctuation(tok))
231
  else:
232
- # 1. Transliterate the Hindi word
233
- translit_word = transliterate_word(model, tok, src_stoi, tgt_itos, max_len=max_word_len)
234
-
235
- # 2. 🌟 POST-PROCESSING: Clean the word before adding it to the list 🌟
236
- # cleaned_word = clean_devanagari_from_roman(translit_word)
237
- out_tokens.append(translit_word)
238
-
239
- # Re-join with simple spacing rules:
240
- # - no space before . , ! ? etc.
241
- # - space elsewhere
242
  result = ""
243
  for i, tok in enumerate(out_tokens):
244
  if i == 0:
245
  result += tok
246
  else:
247
- # Punctuation check is robust as it uses the cleaned token 'tok'
248
  if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
249
- # no leading space
250
  result += tok
251
  elif result and result[-1] in ["(", "“"]:
252
- # no space after opening bracket/quote
253
  result += tok
254
  else:
255
  result += " " + tok
 
206
  return "."
207
  return tok
208
 
 
209
  def tokenize_with_correct_unicode(text: str):
210
  """
211
  Splits text by matching contiguous word tokens (including Devanagari matras)
 
217
  # and explicitly add \p{M} to capture matras.
218
  return re.findall(r'[\w\p{L}\p{M}\p{N}]+|\S', text, flags=re.UNICODE)
219
 
220
+
221
+ def transliterate_sentence(sentence: str, max_word_len: int = 30) -> str:
222
+ if not sentence.strip():
223
+ return ""
224
+
225
  tokens = tokenize_with_correct_unicode(sentence)
 
226
  out_tokens = []
227
 
228
  for tok in tokens:
229
  if is_punctuation_token(tok):
 
230
  out_tokens.append(map_punctuation(tok))
231
  else:
232
+ out_tokens.append(transliterate_word(tok, max_len=max_word_len))
233
+
234
+ # Simple detokenizer: space before words, no space before . , ! ? etc.
 
 
 
 
 
 
 
235
  result = ""
236
  for i, tok in enumerate(out_tokens):
237
  if i == 0:
238
  result += tok
239
  else:
 
240
  if tok in [".", ",", "!", "?", ";", ":", ")", "”"]:
 
241
  result += tok
242
  elif result and result[-1] in ["(", "“"]:
 
243
  result += tok
244
  else:
245
  result += " " + tok