ogawaal committed on
Commit
4b680ce
·
verified ·
1 Parent(s): 246a58a

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +9 -4
inference.py CHANGED
@@ -92,7 +92,6 @@ class SudachiTokenizer:
92
  tmp = re.sub(r'[0-9]','',normalized_text)
93
  tmp = re.sub(r'[0-9]', '', tmp)
94
  tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
95
- tmp = re.sub(r'[。]','',tmp)
96
  tmp = re.sub(r'[a-zA-Z]','',tmp)
97
  #絵文字除去
98
  tmp = re.sub(r'[❓]', "", tmp)
@@ -110,7 +109,7 @@ class SudachiTokenizer:
110
  #アルファベットを小文字に統一
111
  token_list = [t.lower() for t in token_list]
112
  #ひらがなのみの単語を除く
113
- token_list = [t for t in token_list if not self.kana_re.match(t)]
114
  #ストップワード除去
115
  token_list = [t for t in token_list if t not in self.stopwords]
116
  return token_list
@@ -189,10 +188,16 @@ def get_word_attn(input_ids, attention_weight) -> Generator[tuple[str, float], N
189
 
190
  def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
191
  normalized_sentence = normalize("NFKC", sentence)
192
-
 
 
 
 
 
 
193
  attention_list, output_list = [], []
194
  for trained_model in trained_models:
195
- input_ids, attention, output = f_a([normalized_sentence], tokenizer_c2, trained_model, device)
196
  attention_list.append(attention)
197
  output_list.append(output)
198
 
 
92
  tmp = re.sub(r'[0-9]','',normalized_text)
93
  tmp = re.sub(r'[0-9]', '', tmp)
94
  tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
 
95
  tmp = re.sub(r'[a-zA-Z]','',tmp)
96
  #絵文字除去
97
  tmp = re.sub(r'[❓]', "", tmp)
 
109
  #アルファベットを小文字に統一
110
  token_list = [t.lower() for t in token_list]
111
  #ひらがなのみの単語を除く
112
+ #token_list = [t for t in token_list if not self.kana_re.match(t)]
113
  #ストップワード除去
114
  token_list = [t for t in token_list if t not in self.stopwords]
115
  return token_list
 
188
 
189
  def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
190
  normalized_sentence = normalize("NFKC", sentence)
191
+ tmp = re.sub(r'[0-9]','',normalized_sentence)
192
+ tmp = re.sub(r'[0-9]', '', tmp)
193
+ tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
194
+ tmp = re.sub(r'[a-zA-Z]','',tmp)
195
+ #絵文字除去
196
+ tmp = re.sub(r'[❓]', "", tmp)
197
+
198
  attention_list, output_list = [], []
199
  for trained_model in trained_models:
200
+ input_ids, attention, output = f_a([tmp], tokenizer_c2, trained_model, device)
201
  attention_list.append(attention)
202
  output_list.append(output)
203