omdeep22 committed on
Commit
2828776
·
verified ·
1 Parent(s): 895eea2

Upload Gonyai-TEO2 — Konkani language model (251M)

Browse files
Files changed (1) hide show
  1. modeling_gonyai.py +30 -0
modeling_gonyai.py CHANGED
@@ -370,4 +370,34 @@ class KonkanGPT(PreTrainedModel):
370
  if marker and marker in response:
371
  response = response.split(marker)[0].strip()
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  return response
 
370
  if marker and marker in response:
371
  response = response.split(marker)[0].strip()
372
 
373
+ # Post-processing: remove English words/phrases in brackets
374
+ # e.g. "निसर्गसंपदा (Mobily)" → "निसर्गसंपदा"
375
+ # e.g. "शार (City of Goa)" → "शार"
376
+ # Keeps Devanagari/numbers/Konkani punctuation in brackets intact
377
+ import re
378
def _is_english_content(text):
    """Return True when more than 40% of the characters in *text* are Latin letters.

    Every character of *text* counts toward the length, including any
    brackets or whitespace the caller passes in. An empty string yields
    False.
    """
    latin = sum(1 for c in text if 'a' <= c.lower() <= 'z')
    return latin > len(text) * 0.4

def _clean_brackets(text):
    """Remove bracketed spans that are mostly Latin text, then tidy spacing.

    Round-bracket spans are processed first, then square-bracket spans.
    A span (together with the whitespace directly before it) is dropped
    only when ``_is_english_content`` judges it to be English; any other
    bracketed span is kept unchanged.

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    def _drop_if_english(match):
        # Judge the whole match (leading whitespace + brackets + content).
        span = match.group()
        return '' if _is_english_content(span) else span

    # (round brackets) first, then [square brackets]
    for pattern in (r'\s*\([^)]*\)', r'\s*\[[^\]]*\]'):
        text = re.sub(pattern, _drop_if_english, text)

    # Collapse runs of spaces left behind by the removals.
    text = re.sub(r' {2,}', ' ', text)
    # Remove the space before punctuation but keep the punctuation mark
    # itself: the replacement must be the captured group, not ''.
    text = re.sub(r' ([,।.!?])', r'\1', text)
    return text.strip()
400
+
401
+ response = _clean_brackets(response)
402
+
403
  return response