FireRed Team committed on
Commit
5de5744
·
verified ·
1 Parent(s): 15addad

Update fireredasr2s/fireredasr2/tokenizer/llm_tokenizer.py

Browse files
fireredasr2s/fireredasr2/tokenizer/llm_tokenizer.py CHANGED
@@ -72,10 +72,11 @@ class LlmTokenizerWrapper:
72
  )
73
 
74
  # Padding texts
 
75
  max_len_texts = max([len(text) for text in texts])
76
  if tokenizer.padding_side == "right":
77
  texts = [
78
- text + [tokenizer.pad_token_id] * (max_len_texts - len(text))
79
  for text in texts
80
  ]
81
  else:
@@ -83,7 +84,7 @@ class LlmTokenizerWrapper:
83
  [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
84
  for text in texts
85
  ]
86
- print(texts)
87
  input_ids = torch.tensor(texts, dtype=torch.int)
88
 
89
  target_ids = input_ids.clone()
 
72
  )
73
 
74
  # Padding texts
75
+ print(1, texts)
76
  max_len_texts = max([len(text) for text in texts])
77
  if tokenizer.padding_side == "right":
78
  texts = [
79
+ list(text) + [tokenizer.pad_token_id] * (max_len_texts - len(text))
80
  for text in texts
81
  ]
82
  else:
 
84
  [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
85
  for text in texts
86
  ]
87
+ print(2, texts)
88
  input_ids = torch.tensor(texts, dtype=torch.int)
89
 
90
  target_ids = input_ids.clone()