Spaces: Sleeping
Commit: "Update inference.py"
Browse files — inference.py (+9 lines, −4 lines)
inference.py
CHANGED
|
@@ -92,7 +92,6 @@ class SudachiTokenizer:
|
|
| 92 |
tmp = re.sub(r'[0-9]','',normalized_text)
|
| 93 |
tmp = re.sub(r'[0-9]', '', tmp)
|
| 94 |
tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
|
| 95 |
-
tmp = re.sub(r'[。]','',tmp)
|
| 96 |
tmp = re.sub(r'[a-zA-Z]','',tmp)
|
| 97 |
#絵文字除去
|
| 98 |
tmp = re.sub(r'[❓]', "", tmp)
|
|
@@ -110,7 +109,7 @@ class SudachiTokenizer:
|
|
| 110 |
#アルファベットを小文字に統一
|
| 111 |
token_list = [t.lower() for t in token_list]
|
| 112 |
#ひらがなのみの単語を除く
|
| 113 |
-
token_list = [t for t in token_list if not self.kana_re.match(t)]
|
| 114 |
#ストップワード除去
|
| 115 |
token_list = [t for t in token_list if t not in self.stopwords]
|
| 116 |
return token_list
|
|
@@ -189,10 +188,16 @@ def get_word_attn(input_ids, attention_weight) -> Generator[tuple[str, float], N
|
|
| 189 |
|
| 190 |
def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
|
| 191 |
normalized_sentence = normalize("NFKC", sentence)
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
attention_list, output_list = [], []
|
| 194 |
for trained_model in trained_models:
|
| 195 |
-
input_ids, attention, output = f_a([
|
| 196 |
attention_list.append(attention)
|
| 197 |
output_list.append(output)
|
| 198 |
|
|
|
|
| 92 |
tmp = re.sub(r'[0-9]','',normalized_text)
|
| 93 |
tmp = re.sub(r'[0-9]', '', tmp)
|
| 94 |
tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
|
|
|
|
| 95 |
tmp = re.sub(r'[a-zA-Z]','',tmp)
|
| 96 |
#絵文字除去
|
| 97 |
tmp = re.sub(r'[❓]', "", tmp)
|
|
|
|
| 109 |
#アルファベットを小文字に統一
|
| 110 |
token_list = [t.lower() for t in token_list]
|
| 111 |
#ひらがなのみの単語を除く
|
| 112 |
+
#token_list = [t for t in token_list if not self.kana_re.match(t)]
|
| 113 |
#ストップワード除去
|
| 114 |
token_list = [t for t in token_list if t not in self.stopwords]
|
| 115 |
return token_list
|
|
|
|
| 188 |
|
| 189 |
def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
|
| 190 |
normalized_sentence = normalize("NFKC", sentence)
|
| 191 |
+
tmp = re.sub(r'[0-9]','',normalized_sentence)
|
| 192 |
+
tmp = re.sub(r'[0-9]', '', tmp)
|
| 193 |
+
tmp = re.sub(r'[、。:()「」%『』()?!%→+`.・×,〜~—+=♪/!?]','',tmp)
|
| 194 |
+
tmp = re.sub(r'[a-zA-Z]','',tmp)
|
| 195 |
+
#絵文字除去
|
| 196 |
+
tmp = re.sub(r'[❓]', "", tmp)
|
| 197 |
+
|
| 198 |
attention_list, output_list = [], []
|
| 199 |
for trained_model in trained_models:
|
| 200 |
+
input_ids, attention, output = f_a([tmp], tokenizer_c2, trained_model, device)
|
| 201 |
attention_list.append(attention)
|
| 202 |
output_list.append(output)
|
| 203 |
|