ogawaal committed on
Commit
4b680ce
·
verified ·
1 Parent(s): 246a58a

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +9 -4
inference.py CHANGED
@@ -92,7 +92,6 @@ class SudachiTokenizer:
92
  tmp = re.sub(r'[0-9]','',normalized_text)
93
  tmp = re.sub(r'[0-9]', '', tmp)
94
  tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
95
- tmp = re.sub(r'[。]','',tmp)
96
  tmp = re.sub(r'[a-zA-Z]','',tmp)
97
  #絵文字除去
98
  tmp = re.sub(r'[❓]', "", tmp)
@@ -110,7 +109,7 @@ class SudachiTokenizer:
110
  #アルファベットを小文字に統一
111
  token_list = [t.lower() for t in token_list]
112
  #ひらがなのみの単語を除く
113
- token_list = [t for t in token_list if not self.kana_re.match(t)]
114
  #ストップワード除去
115
  token_list = [t for t in token_list if t not in self.stopwords]
116
  return token_list
@@ -189,10 +188,16 @@ def get_word_attn(input_ids, attention_weight) -> Generator[tuple[str, float], N
189
 
190
  def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
191
  normalized_sentence = normalize("NFKC", sentence)
192
-
 
 
 
 
 
 
193
  attention_list, output_list = [], []
194
  for trained_model in trained_models:
195
- input_ids, attention, output = f_a([normalized_sentence], tokenizer_c2, trained_model, device)
196
  attention_list.append(attention)
197
  output_list.append(output)
198
 
 
92
  tmp = re.sub(r'[0-9]','',normalized_text)
93
  tmp = re.sub(r'[0-9]', '', tmp)
94
  tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
 
95
  tmp = re.sub(r'[a-zA-Z]','',tmp)
96
  #絵文字除去
97
  tmp = re.sub(r'[❓]', "", tmp)
 
109
  #アルファベットを小文字に統一
110
  token_list = [t.lower() for t in token_list]
111
  #ひらがなのみの単語を除く
112
+ #token_list = [t for t in token_list if not self.kana_re.match(t)]
113
  #ストップワード除去
114
  token_list = [t for t in token_list if t not in self.stopwords]
115
  return token_list
 
188
 
189
  def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
190
  normalized_sentence = normalize("NFKC", sentence)
191
+ tmp = re.sub(r'[0-9]','',normalized_sentence)
192
+ tmp = re.sub(r'[0-9]', '', tmp)
193
+ tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
194
+ tmp = re.sub(r'[a-zA-Z]','',tmp)
195
+ #絵文字除去
196
+ tmp = re.sub(r'[❓]', "", tmp)
197
+
198
  attention_list, output_list = [], []
199
  for trained_model in trained_models:
200
+ input_ids, attention, output = f_a([tmp], tokenizer_c2, trained_model, device)
201
  attention_list.append(attention)
202
  output_list.append(output)
203