Navya-Sree committed on
Commit
bddb49d
·
verified ·
1 Parent(s): ac649df

Update cultural_tokenizer.py

Browse files
Files changed (1) hide show
  1. cultural_tokenizer.py +11 -16
cultural_tokenizer.py CHANGED
@@ -1,12 +1,10 @@
1
  from transformers import M2M100Tokenizer
2
 
3
  class CulturalTokenizer(M2M100Tokenizer):
4
- """Custom tokenizer with UNESCO language support"""
5
 
6
  def __init__(self, *args, **kwargs):
7
  super().__init__(*args, **kwargs)
8
-
9
- # Add special tokens
10
  self.add_special_tokens({
11
  'additional_special_tokens': [
12
  '<CULTURAL_CONTEXT>',
@@ -14,17 +12,14 @@ class CulturalTokenizer(M2M100Tokenizer):
14
  '<VULNERABLE>'
15
  ]
16
  })
17
-
18
- # Language vitality mapping
19
- self.vitality_tags = {
20
- 'ay': '<VULNERABLE>',
21
- 'chr': '<ENDANGERED>',
22
- 'qu': '<VULNERABLE>'
23
- }
24
 
25
- def prepare_seq2seq_batch(self, *args, **kwargs):
26
- """Add vitality tags to source text"""
27
- src_lang = kwargs.get('src_lang')
28
- if src_lang in self.vitality_tags:
29
- kwargs['src_text'] = f"{self.vitality_tags[src_lang]} {kwargs['src_text']}"
30
- return super().prepare_seq2seq_batch(*args, **kwargs)
 
 
 
 
 
1
  from transformers import M2M100Tokenizer
2
 
3
  class CulturalTokenizer(M2M100Tokenizer):
4
+ """Tokenizer with UNESCO language enhancements"""
5
 
6
  def __init__(self, *args, **kwargs):
7
  super().__init__(*args, **kwargs)
 
 
8
  self.add_special_tokens({
9
  'additional_special_tokens': [
10
  '<CULTURAL_CONTEXT>',
 
12
  '<VULNERABLE>'
13
  ]
14
  })
 
 
 
 
 
 
 
15
 
16
+ def prepare_seq2seq_batch(self, src_text, **kwargs):
17
+ """Add cultural metadata to endangered languages"""
18
+ tgt_lang = kwargs.get('tgt_lang')
19
+
20
+ if tgt_lang in ['ay', 'qu']: # Vulnerable languages
21
+ src_text = f"<VULNERABLE> {src_text}"
22
+ elif tgt_lang in ['chr']: # Endangered languages
23
+ src_text = f"<ENDANGERED> {src_text}"
24
+
25
+ return super().prepare_seq2seq_batch(src_text, **kwargs)