Spaces:
Sleeping
Sleeping
koheibaba commited on
Commit ·
e7a412f
1
Parent(s): 79f44cf
upload files
Browse files- .DS_Store +0 -0
- .gitignore +1 -0
- app.py +92 -0
- input/.DS_Store +0 -0
- input/images/detector.jpg +0 -0
- input/images/user.jpg +0 -0
- input/llm_weights/.DS_Store +0 -0
- input/llm_weights/README.md +20 -0
- input/llm_weights/adapter_config.json +23 -0
- input/llm_weights/runs/.DS_Store +0 -0
- input/llm_weights/runs/Feb12_11-51-08_c9d9b7430adb/events.out.tfevents.1707738699.c9d9b7430adb.2957.1 +0 -0
- input/ng_wordlists/.DS_Store +0 -0
- input/ng_wordlists/ng_wordlist_1_offensive.txt +103 -0
- input/ng_wordlists/ng_wordlist_1_sexual.txt +214 -0
- input/ng_wordlists/ng_wordlist_2_offensive.txt +42 -0
- input/ng_wordlists/ng_wordlist_2_sexual.txt +279 -0
- requirements.txt +11 -0
- utils/find_ng_word.py +69 -0
- utils/llm.py +89 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.bin
|
app.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from utils.find_ng_word import get_ng_wordlist, get_ng_wordlist_from_saved, search_ng_word
|
| 4 |
+
from utils.llm import load_llm_from_pretrained, inference
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# The NG word lists are committed under input/ng_wordlists/, next to the
# adapter weights in input/llm_weights/. The previous Google Drive paths
# (/content/drive/...) only resolve inside a Colab session.
wordlist_1_path_s = "input/ng_wordlists/ng_wordlist_1_sexual.txt"
wordlist_2_path_s = "input/ng_wordlists/ng_wordlist_2_sexual.txt"
wordlist_1_path_o = "input/ng_wordlists/ng_wordlist_1_offensive.txt"
wordlist_2_path_o = "input/ng_wordlists/ng_wordlist_2_offensive.txt"

pretrained_model_path = "input/llm_weights"

print("モデルをロード")
# *_1_* lists hold single-token entries, *_2_* lists hold multi-token entries;
# _s / _o are the sexual / offensive categories.
ng_wordlist_1_s, ng_wordlist_2_s = get_ng_wordlist_from_saved(wordlist_1_path_s, wordlist_2_path_s)
ng_wordlist_1_o, ng_wordlist_2_o = get_ng_wordlist_from_saved(wordlist_1_path_o, wordlist_2_path_o)

model, tokenizer = load_llm_from_pretrained(pretrained_model_path)
|
| 19 |
+
|
| 20 |
+
# 検出結果を生成
|
| 21 |
+
def detect_ng_word(input_text):
    """Build the report lines for one piece of user text.

    The text is first matched against the sexual and offensive NG word
    lists, then passed (together with the labelled hits) to the LLM
    classifier.

    Args:
        input_text: The text to check.

    Returns:
        A list of strings: a header line, one line per detected NG word
        (if any), and a final line with the classifier verdict.
    """
    response = []
    # Search the text that was actually passed in; the previous code read an
    # undefined ``data_point`` variable and raised NameError on every call.
    rtn_s = search_ng_word(input_text, ng_wordlist_1_s, ng_wordlist_2_s)
    rtn_o = search_ng_word(input_text, ng_wordlist_1_o, ng_wordlist_2_o)
    rtn = rtn_s + rtn_o

    if not rtn:
        response.append("NGワードは検知されませんでした \n")
    else:
        response.append('以下のNGワードを検知しました \n')
        response.extend(str(ng_word) + " \n" for ng_word in rtn)

    # The classifier prompt receives each hit tagged with its category.
    ngword_with_label = (
        [ri + "(sexual)" for ri in rtn_s]
        + [ri + "(offensive)" for ri in rtn_o]
    )

    output = inference(model, tokenizer, input_text, ngword_with_label)

    # Only these exact generations (including the trailing EOS marker) count
    # as a positive verdict; anything else is reported as "nothing detected".
    verdicts = {
        "はい。攻撃的だから。</s>": '不適切な内容を検知しました(攻撃的)',
        "はい。暴力的だから。</s>": '不適切な内容を検知しました(暴力的)',
        "はい。差別的だから。</s>": '不適切な内容を検知しました(差別的)',
        "はい。性的だから。</s>": '不適切な内容を検知しました(性的)',
        "はい。政治的だから。</s>": '不適切な内容を検知しました(政治的)',
    }
    response.append(verdicts.get(output, "不適切な内容は検知されませんでした"))

    return response
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# 会話履歴用リスト型変数
|
| 58 |
+
message_history = []  # alternating user / assistant entries, oldest first


def chat(user_msg):
    """Check one message and return the whole conversation so far.

    Args:
        user_msg: The message typed by the user.

    Returns:
        A list of ``(user_content, assistant_content)`` tuples, one per
        completed exchange, in the shape expected by the chatbot widget.
    """
    # NOTE(review): the history is a module-level list, so it is shared by
    # every caller within this process.
    # Record the user's turn before running detection.
    message_history.append({"role": "user", "content": user_msg})

    # Run detection and record the joined report as the assistant's turn.
    report_lines = detect_ng_word(user_msg)
    message_history.append({"role": "assistant", "content": " ".join(report_lines)})

    # Pair each even-indexed (user) entry with the following (assistant) entry.
    contents = [entry["content"] for entry in message_history]
    return list(zip(contents[0::2], contents[1::2]))
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
with gr.Blocks() as demo:
    # Chatbot UI wiring.
    chatbot = gr.Chatbot()
    # Named ``textbox`` rather than ``input`` so the builtin is not shadowed.
    textbox = gr.Textbox(show_label=False, placeholder="チェックしたい文章を入力してください")
    # On submit, run the check and show the full conversation in the chat pane.
    textbox.submit(fn=chat, inputs=textbox, outputs=chatbot)
    # Second handler on the same event: clear the input box.
    textbox.submit(fn=lambda: "", inputs=None, outputs=textbox)

demo.launch()
|
input/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
input/images/detector.jpg
ADDED
|
input/images/user.jpg
ADDED
|
input/llm_weights/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
input/llm_weights/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: peft
|
| 3 |
+
---
|
| 4 |
+
## Training procedure
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
The following `bitsandbytes` quantization config was used during training:
|
| 8 |
+
- load_in_8bit: False
|
| 9 |
+
- load_in_4bit: True
|
| 10 |
+
- llm_int8_threshold: 6.0
|
| 11 |
+
- llm_int8_skip_modules: None
|
| 12 |
+
- llm_int8_enable_fp32_cpu_offload: False
|
| 13 |
+
- llm_int8_has_fp16_weight: False
|
| 14 |
+
- bnb_4bit_quant_type: nf4
|
| 15 |
+
- bnb_4bit_use_double_quant: False
|
| 16 |
+
- bnb_4bit_compute_dtype: float16
|
| 17 |
+
### Framework versions
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- PEFT 0.4.0
|
input/llm_weights/adapter_config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_mapping": null,
|
| 3 |
+
"base_model_name_or_path": "line-corporation/japanese-large-lm-3.6b",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"fan_in_fan_out": false,
|
| 6 |
+
"inference_mode": true,
|
| 7 |
+
"init_lora_weights": true,
|
| 8 |
+
"layers_pattern": null,
|
| 9 |
+
"layers_to_transform": null,
|
| 10 |
+
"lora_alpha": 16,
|
| 11 |
+
"lora_dropout": 0.1,
|
| 12 |
+
"modules_to_save": null,
|
| 13 |
+
"peft_type": "LORA",
|
| 14 |
+
"r": 64,
|
| 15 |
+
"revision": null,
|
| 16 |
+
"target_modules": [
|
| 17 |
+
"dense_4h_to_h",
|
| 18 |
+
"dense",
|
| 19 |
+
"dense_h_to_4h",
|
| 20 |
+
"query_key_value"
|
| 21 |
+
],
|
| 22 |
+
"task_type": "CAUSAL_LM"
|
| 23 |
+
}
|
input/llm_weights/runs/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
input/llm_weights/runs/Feb12_11-51-08_c9d9b7430adb/events.out.tfevents.1707738699.c9d9b7430adb.2957.1
ADDED
|
Binary file (9.97 kB). View file
|
|
|
input/ng_wordlists/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
input/ng_wordlists/ng_wordlist_1_offensive.txt
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ASSHOLE
|
| 2 |
+
BITCH
|
| 3 |
+
FUCK
|
| 4 |
+
GODDAMN
|
| 5 |
+
GYPSY
|
| 6 |
+
INDIAN
|
| 7 |
+
JAP
|
| 8 |
+
MOTHERFUCKER
|
| 9 |
+
NIGGER
|
| 10 |
+
SHIT
|
| 11 |
+
asshole
|
| 12 |
+
bitch
|
| 13 |
+
fuck
|
| 14 |
+
goddamn
|
| 15 |
+
gypsy
|
| 16 |
+
indian
|
| 17 |
+
jap
|
| 18 |
+
motherfucker
|
| 19 |
+
nigger
|
| 20 |
+
shit
|
| 21 |
+
あほ
|
| 22 |
+
いざり
|
| 23 |
+
かす
|
| 24 |
+
きちがい
|
| 25 |
+
ぎっちょ
|
| 26 |
+
くそ
|
| 27 |
+
じじい
|
| 28 |
+
つんぼ
|
| 29 |
+
でぶ
|
| 30 |
+
でべそ
|
| 31 |
+
はげ
|
| 32 |
+
ばか
|
| 33 |
+
ばばあ
|
| 34 |
+
びっこ
|
| 35 |
+
ぶす
|
| 36 |
+
ぼけ
|
| 37 |
+
まぬけ
|
| 38 |
+
めくら
|
| 39 |
+
アスペ
|
| 40 |
+
アホ
|
| 41 |
+
イザリ
|
| 42 |
+
カス
|
| 43 |
+
カタワ
|
| 44 |
+
ガイジ
|
| 45 |
+
キチガイ
|
| 46 |
+
ギッチョ
|
| 47 |
+
クソ
|
| 48 |
+
クソアマ
|
| 49 |
+
クソガキ
|
| 50 |
+
クソクラエ
|
| 51 |
+
クソゴミ
|
| 52 |
+
ジジイ
|
| 53 |
+
ジャップ
|
| 54 |
+
スチュワーデス
|
| 55 |
+
ステハゲ
|
| 56 |
+
ツンボ
|
| 57 |
+
デブ
|
| 58 |
+
デベソ
|
| 59 |
+
ナマポ
|
| 60 |
+
ネトウヨ
|
| 61 |
+
ハゲ
|
| 62 |
+
バカ
|
| 63 |
+
バカヤロウ
|
| 64 |
+
バカヤロー
|
| 65 |
+
ババア
|
| 66 |
+
パヨク
|
| 67 |
+
ビッコ
|
| 68 |
+
ビッチ
|
| 69 |
+
ピネガキ
|
| 70 |
+
ブサイク
|
| 71 |
+
ブス
|
| 72 |
+
ボケ
|
| 73 |
+
マザファッカー
|
| 74 |
+
マヌケ
|
| 75 |
+
メクラ
|
| 76 |
+
下女
|
| 77 |
+
下男
|
| 78 |
+
乞食
|
| 79 |
+
低脳
|
| 80 |
+
助産婦
|
| 81 |
+
唖
|
| 82 |
+
土人
|
| 83 |
+
土方
|
| 84 |
+
尻軽
|
| 85 |
+
屠殺
|
| 86 |
+
後進国
|
| 87 |
+
支那
|
| 88 |
+
気違い
|
| 89 |
+
池沼
|
| 90 |
+
狂人
|
| 91 |
+
狂女
|
| 92 |
+
畜生
|
| 93 |
+
白痴
|
| 94 |
+
盲人
|
| 95 |
+
看護婦
|
| 96 |
+
精神分裂病
|
| 97 |
+
糖質
|
| 98 |
+
老婆
|
| 99 |
+
肌色
|
| 100 |
+
農夫
|
| 101 |
+
醜男
|
| 102 |
+
非国民
|
| 103 |
+
黒んぼ
|
input/ng_wordlists/ng_wordlist_1_sexual.txt
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
COCKSUCKER
|
| 2 |
+
CUNT
|
| 3 |
+
DICK
|
| 4 |
+
NTR
|
| 5 |
+
SEX
|
| 6 |
+
SM
|
| 7 |
+
SOD
|
| 8 |
+
TENGA
|
| 9 |
+
TITS
|
| 10 |
+
cocksucker
|
| 11 |
+
cunt
|
| 12 |
+
dick
|
| 13 |
+
ntr
|
| 14 |
+
sex
|
| 15 |
+
sm
|
| 16 |
+
sod
|
| 17 |
+
tenga
|
| 18 |
+
tits
|
| 19 |
+
あげまん
|
| 20 |
+
いく
|
| 21 |
+
いやらしい
|
| 22 |
+
えっち
|
| 23 |
+
えろ
|
| 24 |
+
おっぱい
|
| 25 |
+
おぼこ
|
| 26 |
+
おめこ
|
| 27 |
+
きんたま
|
| 28 |
+
けつまんこ
|
| 29 |
+
さげまん
|
| 30 |
+
すけべ
|
| 31 |
+
ちんこ
|
| 32 |
+
ちんちん
|
| 33 |
+
ちんぽ
|
| 34 |
+
はめ撮り
|
| 35 |
+
ぱいずり
|
| 36 |
+
ふたなり
|
| 37 |
+
ぽこちん
|
| 38 |
+
まんこ
|
| 39 |
+
まんまん
|
| 40 |
+
むらむら
|
| 41 |
+
やりまん
|
| 42 |
+
アクメ
|
| 43 |
+
アゲマン
|
| 44 |
+
アナニー
|
| 45 |
+
アナル
|
| 46 |
+
アナルセックス
|
| 47 |
+
アヌス
|
| 48 |
+
イク
|
| 49 |
+
イチモツ
|
| 50 |
+
イチャラブセックス
|
| 51 |
+
イメクラ
|
| 52 |
+
イヤラシイ
|
| 53 |
+
イラマチオ
|
| 54 |
+
インポ
|
| 55 |
+
インポテンツ
|
| 56 |
+
エクスタシー
|
| 57 |
+
エッチ
|
| 58 |
+
エロ
|
| 59 |
+
エロい
|
| 60 |
+
エロイ
|
| 61 |
+
エロ本
|
| 62 |
+
オチンチン
|
| 63 |
+
オッパイ
|
| 64 |
+
オナニー
|
| 65 |
+
オナペ
|
| 66 |
+
オナペット
|
| 67 |
+
オナホ
|
| 68 |
+
オナホール
|
| 69 |
+
オネショタ
|
| 70 |
+
オボコ
|
| 71 |
+
オマンコ
|
| 72 |
+
オメコ
|
| 73 |
+
オーガズム
|
| 74 |
+
カウパー
|
| 75 |
+
キンタマ
|
| 76 |
+
クスコ
|
| 77 |
+
クソガキ
|
| 78 |
+
クリトリス
|
| 79 |
+
クンニ
|
| 80 |
+
クンニリングス
|
| 81 |
+
ケツマンコ
|
| 82 |
+
コンドーム
|
| 83 |
+
サゲマン
|
| 84 |
+
ザーメン
|
| 85 |
+
シコ
|
| 86 |
+
ショタオネ
|
| 87 |
+
スカトロ
|
| 88 |
+
スケベ
|
| 89 |
+
スペルマ
|
| 90 |
+
スワッピング
|
| 91 |
+
セックス
|
| 92 |
+
セフレ
|
| 93 |
+
センズリ
|
| 94 |
+
ソフト・オン・デマンド
|
| 95 |
+
ダイシュキホールド
|
| 96 |
+
チンコ
|
| 97 |
+
チンチン
|
| 98 |
+
チンポ
|
| 99 |
+
ディルド
|
| 100 |
+
デカチン
|
| 101 |
+
デリヘル
|
| 102 |
+
トルコ風呂
|
| 103 |
+
ナンパ
|
| 104 |
+
ノーパン
|
| 105 |
+
ハーレム
|
| 106 |
+
バイアグラ
|
| 107 |
+
バイブ
|
| 108 |
+
パイズリ
|
| 109 |
+
パイパン
|
| 110 |
+
パンチラ
|
| 111 |
+
ビッチ
|
| 112 |
+
フェラ
|
| 113 |
+
フェラチオ
|
| 114 |
+
フタナリ
|
| 115 |
+
ブルセラ
|
| 116 |
+
ペッティング
|
| 117 |
+
ペニバン
|
| 118 |
+
ポコチン
|
| 119 |
+
ポルチオ
|
| 120 |
+
マスターベーション
|
| 121 |
+
マンコ
|
| 122 |
+
マンマン
|
| 123 |
+
ムラムラ
|
| 124 |
+
ヤリチン
|
| 125 |
+
ヤリマン
|
| 126 |
+
ラブドール
|
| 127 |
+
ラブホ
|
| 128 |
+
ラブホテル
|
| 129 |
+
リフレ
|
| 130 |
+
レイプ
|
| 131 |
+
ロリコン
|
| 132 |
+
乱交
|
| 133 |
+
乳房
|
| 134 |
+
乳輪
|
| 135 |
+
乳首
|
| 136 |
+
亀頭
|
| 137 |
+
二穴
|
| 138 |
+
仮性包茎
|
| 139 |
+
体位
|
| 140 |
+
催眠
|
| 141 |
+
円光
|
| 142 |
+
処女
|
| 143 |
+
勃起
|
| 144 |
+
包茎
|
| 145 |
+
喘ぎ声
|
| 146 |
+
姦通
|
| 147 |
+
姫始め
|
| 148 |
+
媚薬
|
| 149 |
+
寝取り
|
| 150 |
+
射精
|
| 151 |
+
屍姦
|
| 152 |
+
巨乳
|
| 153 |
+
巨根
|
| 154 |
+
座位
|
| 155 |
+
強姦
|
| 156 |
+
後背位
|
| 157 |
+
微乳
|
| 158 |
+
性交
|
| 159 |
+
性感
|
| 160 |
+
性感帯
|
| 161 |
+
性欲
|
| 162 |
+
性行為
|
| 163 |
+
情夫
|
| 164 |
+
情婦
|
| 165 |
+
愛人
|
| 166 |
+
愛撫
|
| 167 |
+
愛液
|
| 168 |
+
手淫
|
| 169 |
+
援交
|
| 170 |
+
援助交際
|
| 171 |
+
放尿
|
| 172 |
+
早漏
|
| 173 |
+
正常位
|
| 174 |
+
泡姫
|
| 175 |
+
淫乱
|
| 176 |
+
淫行
|
| 177 |
+
淫靡
|
| 178 |
+
潮吹き
|
| 179 |
+
熟女
|
| 180 |
+
爆乳
|
| 181 |
+
獣姦
|
| 182 |
+
男娼
|
| 183 |
+
痴女
|
| 184 |
+
発情
|
| 185 |
+
睾丸
|
| 186 |
+
種付け
|
| 187 |
+
立ちんぼ
|
| 188 |
+
童貞
|
| 189 |
+
素股
|
| 190 |
+
素股
|
| 191 |
+
絶倫
|
| 192 |
+
緊縛
|
| 193 |
+
自慰
|
| 194 |
+
菊門
|
| 195 |
+
裏筋
|
| 196 |
+
視姦
|
| 197 |
+
貝合わせ
|
| 198 |
+
貧乳
|
| 199 |
+
輪姦
|
| 200 |
+
近親相姦
|
| 201 |
+
金玉
|
| 202 |
+
陰唇
|
| 203 |
+
陰嚢
|
| 204 |
+
陰核
|
| 205 |
+
陰毛
|
| 206 |
+
陰茎
|
| 207 |
+
陰部
|
| 208 |
+
陵辱
|
| 209 |
+
青姦
|
| 210 |
+
食糞
|
| 211 |
+
飲尿
|
| 212 |
+
騎乗位
|
| 213 |
+
黄金水
|
| 214 |
+
チンチン
|
input/ng_wordlists/ng_wordlist_2_offensive.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
F*CK
|
| 2 |
+
f*ck
|
| 3 |
+
あすぺ
|
| 4 |
+
かたわ
|
| 5 |
+
がいじ
|
| 6 |
+
くそあま
|
| 7 |
+
くそがき
|
| 8 |
+
くそくらえ
|
| 9 |
+
くそごみ
|
| 10 |
+
ごみ人間
|
| 11 |
+
じゃっぷ
|
| 12 |
+
すちゅわーです
|
| 13 |
+
すてはげ
|
| 14 |
+
なまぽ
|
| 15 |
+
ねとうよ
|
| 16 |
+
ばかやろう
|
| 17 |
+
ばかやろー
|
| 18 |
+
ぱよく
|
| 19 |
+
びっち
|
| 20 |
+
ぴねがき
|
| 21 |
+
ぶさいく
|
| 22 |
+
ぽり公
|
| 23 |
+
まざふぁっかー
|
| 24 |
+
クソくらえ
|
| 25 |
+
ゴミ人間
|
| 26 |
+
ポリ公
|
| 27 |
+
三国人
|
| 28 |
+
支那人
|
| 29 |
+
未開人
|
| 30 |
+
気違イ
|
| 31 |
+
知恵遅れ
|
| 32 |
+
知恵遅レ
|
| 33 |
+
精神異常
|
| 34 |
+
糞くらえ
|
| 35 |
+
糞クラエ
|
| 36 |
+
糞食らえ
|
| 37 |
+
糞食ラエ
|
| 38 |
+
統失
|
| 39 |
+
豚野郎
|
| 40 |
+
馬鹿野郎
|
| 41 |
+
黒ンボ
|
| 42 |
+
○ね
|
input/ng_wordlists/ng_wordlist_2_sexual.txt
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3P
|
| 2 |
+
3p
|
| 3 |
+
AV女優
|
| 4 |
+
Gすぽっと
|
| 5 |
+
Gスポット
|
| 6 |
+
Tばっく
|
| 7 |
+
Tバック
|
| 8 |
+
av女優
|
| 9 |
+
gスポット
|
| 10 |
+
tバック
|
| 11 |
+
あくめ
|
| 12 |
+
あだるとびでお
|
| 13 |
+
あなにー
|
| 14 |
+
あなる
|
| 15 |
+
あなるせっくす
|
| 16 |
+
あなるびーず
|
| 17 |
+
あなるぷらぐ
|
| 18 |
+
あなる拡張
|
| 19 |
+
あなる開発
|
| 20 |
+
あなるSEX
|
| 21 |
+
あぬす
|
| 22 |
+
あへ顔
|
| 23 |
+
いちもつ
|
| 24 |
+
いちゃいちゃせっくす
|
| 25 |
+
いちゃらぶせっくす
|
| 26 |
+
いめくら
|
| 27 |
+
いめーじびでお
|
| 28 |
+
いらまちお
|
| 29 |
+
いんぽ
|
| 30 |
+
いんぽてんつ
|
| 31 |
+
えくすたしー
|
| 32 |
+
えろい
|
| 33 |
+
えろ同人
|
| 34 |
+
えろ同人誌
|
| 35 |
+
えろ本
|
| 36 |
+
おちんちん
|
| 37 |
+
おっπ
|
| 38 |
+
おなにー
|
| 39 |
+
おなぺ
|
| 40 |
+
おなぺっと
|
| 41 |
+
おなほ
|
| 42 |
+
おなほーる
|
| 43 |
+
おねしょた
|
| 44 |
+
おねショタ
|
| 45 |
+
おまんこ
|
| 46 |
+
おーがずむ
|
| 47 |
+
お掃除ふぇら
|
| 48 |
+
お掃除フェラ
|
| 49 |
+
かうぱー
|
| 50 |
+
かんとん包茎
|
| 51 |
+
ぎゃぐぼーる
|
| 52 |
+
くすこ
|
| 53 |
+
くそがき
|
| 54 |
+
くりとりす
|
| 55 |
+
くんに
|
| 56 |
+
くんにりんぐす
|
| 57 |
+
こんどーむ
|
| 58 |
+
さかさ椋鳥
|
| 59 |
+
ざーめん
|
| 60 |
+
しっくすないん
|
| 61 |
+
しぼり芙蓉
|
| 62 |
+
しょたおね
|
| 63 |
+
すかとろ
|
| 64 |
+
すかるふぁっく
|
| 65 |
+
すけべ椅子
|
| 66 |
+
すぺるま
|
| 67 |
+
すわっぴんぐ
|
| 68 |
+
せきれい本手
|
| 69 |
+
せっくす
|
| 70 |
+
せふれ
|
| 71 |
+
せんずり
|
| 72 |
+
そふと・おん・でまんど
|
| 73 |
+
そーぷらんど
|
| 74 |
+
そーぷ嬢
|
| 75 |
+
だいしゅきほーるど
|
| 76 |
+
だいしゅきホールド
|
| 77 |
+
だっちわいふ
|
| 78 |
+
だぶるぴーす
|
| 79 |
+
ち○こ
|
| 80 |
+
でぃるど
|
| 81 |
+
でぃーぷすろーと
|
| 82 |
+
でかちん
|
| 83 |
+
でりばりーへるす
|
| 84 |
+
でりへる
|
| 85 |
+
とるこ風呂
|
| 86 |
+
とろ顔
|
| 87 |
+
なんぱ
|
| 88 |
+
のーぱん
|
| 89 |
+
はーれむ
|
| 90 |
+
ばいあぐら
|
| 91 |
+
ばいぶ
|
| 92 |
+
ばきゅーむふぇら
|
| 93 |
+
ぱいぱん
|
| 94 |
+
ぱぱ活
|
| 95 |
+
ぱんちら
|
| 96 |
+
ひとりえっち
|
| 97 |
+
びっち
|
| 98 |
+
ふぃすとふぁっく
|
| 99 |
+
ふぇら
|
| 100 |
+
ふぇらちお
|
| 101 |
+
ふぇら抜き
|
| 102 |
+
ぶるせら
|
| 103 |
+
ぺってぃんぐ
|
| 104 |
+
ぺにばん
|
| 105 |
+
ほ別
|
| 106 |
+
ぼて腹
|
| 107 |
+
ぽるちお
|
| 108 |
+
ま○こ
|
| 109 |
+
ますたーべーしょん
|
| 110 |
+
まんぐり返し
|
| 111 |
+
まん拓
|
| 112 |
+
やりちん
|
| 113 |
+
らぶどーる
|
| 114 |
+
らぶほ
|
| 115 |
+
らぶほてる
|
| 116 |
+
りふれ
|
| 117 |
+
れいぷ
|
| 118 |
+
ろりこん
|
| 119 |
+
アダルトビデオ
|
| 120 |
+
アナルビーズ
|
| 121 |
+
アナルプラグ
|
| 122 |
+
アナル拡張
|
| 123 |
+
アナル開発
|
| 124 |
+
アナルSEX
|
| 125 |
+
アナルsex
|
| 126 |
+
アヘ顔
|
| 127 |
+
イチャイチャセックス
|
| 128 |
+
イメージビデオ
|
| 129 |
+
エロ同人
|
| 130 |
+
エロ同人誌
|
| 131 |
+
オッΠ
|
| 132 |
+
オッπ
|
| 133 |
+
オ掃除フェラ
|
| 134 |
+
カントン包茎
|
| 135 |
+
ギャグボール
|
| 136 |
+
サカサ椋鳥
|
| 137 |
+
シックスナイン
|
| 138 |
+
シボリ芙蓉
|
| 139 |
+
ショタおね
|
| 140 |
+
スカルファック
|
| 141 |
+
スケベ椅子
|
| 142 |
+
セキレイ本手
|
| 143 |
+
ソープランド
|
| 144 |
+
ソープ嬢
|
| 145 |
+
ダッチワイフ
|
| 146 |
+
ダブルピース
|
| 147 |
+
チ○コ
|
| 148 |
+
ディープスロート
|
| 149 |
+
デリバリーヘルス
|
| 150 |
+
トロ顔
|
| 151 |
+
ハメ撮り
|
| 152 |
+
ハメ撮リ
|
| 153 |
+
バキュームフェラ
|
| 154 |
+
パパ活
|
| 155 |
+
ヒトリエッチ
|
| 156 |
+
フィストファック
|
| 157 |
+
フェラ抜き
|
| 158 |
+
フェラ抜キ
|
| 159 |
+
ホ別
|
| 160 |
+
ボテ腹
|
| 161 |
+
マ○コ
|
| 162 |
+
マングリ返シ
|
| 163 |
+
マン拓
|
| 164 |
+
一人H
|
| 165 |
+
一人h
|
| 166 |
+
中出し
|
| 167 |
+
中出シ
|
| 168 |
+
乙Π
|
| 169 |
+
乙π
|
| 170 |
+
乱れ牡丹
|
| 171 |
+
乱レ牡丹
|
| 172 |
+
亀甲縛り
|
| 173 |
+
亀甲縛リ
|
| 174 |
+
二穴同時
|
| 175 |
+
個人撮影
|
| 176 |
+
兜合わせ
|
| 177 |
+
兜合ワセ
|
| 178 |
+
入船本手
|
| 179 |
+
口内射精
|
| 180 |
+
口内発射
|
| 181 |
+
唐草居茶臼
|
| 182 |
+
喘ギ声
|
| 183 |
+
四十八手
|
| 184 |
+
太ももこき
|
| 185 |
+
太ももコキ
|
| 186 |
+
太モモコキ
|
| 187 |
+
姫始メ
|
| 188 |
+
孕ませ
|
| 189 |
+
孕マセ
|
| 190 |
+
寝取られ
|
| 191 |
+
寝取ラレ
|
| 192 |
+
寝取リ
|
| 193 |
+
寿本手
|
| 194 |
+
巨尻
|
| 195 |
+
帆かけ茶臼
|
| 196 |
+
帆カケ茶臼
|
| 197 |
+
忍び居茶臼
|
| 198 |
+
忍ビ居茶臼
|
| 199 |
+
快楽堕ち
|
| 200 |
+
快楽堕チ
|
| 201 |
+
性処理
|
| 202 |
+
性奴隷
|
| 203 |
+
性感まっさーじ
|
| 204 |
+
性感マッサージ
|
| 205 |
+
成人向け
|
| 206 |
+
成人向ケ
|
| 207 |
+
我慢汁
|
| 208 |
+
手こき
|
| 209 |
+
手まん
|
| 210 |
+
手コキ
|
| 211 |
+
手マン
|
| 212 |
+
抱き地蔵
|
| 213 |
+
抱キ地蔵
|
| 214 |
+
揚羽本手
|
| 215 |
+
放置ぷれい
|
| 216 |
+
放置プレイ
|
| 217 |
+
時雨茶臼
|
| 218 |
+
月見茶臼
|
| 219 |
+
朝勃ち
|
| 220 |
+
朝勃チ
|
| 221 |
+
朝起ち
|
| 222 |
+
朝起チ
|
| 223 |
+
松葉崩し
|
| 224 |
+
松葉崩シ
|
| 225 |
+
機織茶臼
|
| 226 |
+
汁男優
|
| 227 |
+
洞入り本手
|
| 228 |
+
洞入リ本手
|
| 229 |
+
淫語
|
| 230 |
+
潮吹キ
|
| 231 |
+
玉舐め
|
| 232 |
+
玉舐メ
|
| 233 |
+
生はめ
|
| 234 |
+
生ハメ
|
| 235 |
+
真性包茎
|
| 236 |
+
睡姦
|
| 237 |
+
種付けぷれす
|
| 238 |
+
種付けプレス
|
| 239 |
+
種付ケ
|
| 240 |
+
種付ケプレス
|
| 241 |
+
穴兄弟
|
| 242 |
+
立チンボ
|
| 243 |
+
笠舟本手
|
| 244 |
+
筆おろし
|
| 245 |
+
筆オロシ
|
| 246 |
+
筏本手
|
| 247 |
+
粗ちん
|
| 248 |
+
粗チン
|
| 249 |
+
網代本手
|
| 250 |
+
肉便器
|
| 251 |
+
胸ちら
|
| 252 |
+
胸チラ
|
| 253 |
+
脇こき
|
| 254 |
+
脇コキ
|
| 255 |
+
蟻の戸渡り
|
| 256 |
+
蟻ノ戸渡リ
|
| 257 |
+
貝合ワセ
|
| 258 |
+
足こき
|
| 259 |
+
足コキ
|
| 260 |
+
逆あなる
|
| 261 |
+
逆れいぷ
|
| 262 |
+
逆アナル
|
| 263 |
+
逆レイプ
|
| 264 |
+
遅漏
|
| 265 |
+
雁が首
|
| 266 |
+
雁ガ首
|
| 267 |
+
電ま
|
| 268 |
+
電マ
|
| 269 |
+
顔射
|
| 270 |
+
顔面騎乗
|
| 271 |
+
首引き恋慕
|
| 272 |
+
首引キ恋慕
|
| 273 |
+
鶯の谷渡り
|
| 274 |
+
鶯ノ谷渡リ
|
| 275 |
+
黒ぎゃる
|
| 276 |
+
黒ギャル
|
| 277 |
+
SMぷれい
|
| 278 |
+
SMプレイ
|
| 279 |
+
smプレイ
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==0.21.0
|
| 2 |
+
peft==0.4.0
|
| 3 |
+
bitsandbytes==0.40.2
|
| 4 |
+
transformers==4.31.0
|
| 5 |
+
trl==0.4.7
|
| 6 |
+
sentencepiece
|
| 7 |
+
ginza
|
| 8 |
+
ja-ginza
|
| 9 |
+
jaconv
|
| 10 |
+
gradio
|
| 11 |
+
scipy
|
utils/find_ng_word.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import jaconv
|
| 3 |
+
import spacy
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# 文字列としての検索
|
| 7 |
+
def find_string(text, wordlist):
    """Return the entries of ``wordlist`` that occur in ``text`` as substrings.

    Entries are matched literally. They were previously used as regular
    expression patterns, so an entry such as ``F*CK`` matched any text
    containing ``CK``, and an entry with unbalanced metacharacters would
    raise ``re.error``.

    Args:
        text: The text to search.
        wordlist: Iterable of NG word strings.

    Returns:
        A list of the matching entries, in ``wordlist`` order.
    """
    return [word for word in wordlist if word in text]
|
| 13 |
+
|
| 14 |
+
# 単語としての検索
|
| 15 |
+
def find_word(text, wordlist):
    """Return the tokens of ``text`` whose lemma is listed in ``wordlist``.

    Args:
        text: The text to tokenize with the ``ja_ginza`` pipeline.
        wordlist: Iterable of NG word strings compared against token lemmas.

    Returns:
        A list with the surface form (``str(token)``) of every matching
        token, in document order.
    """
    nlp = _load_ginza()
    targets = frozenset(wordlist)  # constant-time membership per token
    return [str(token) for token in nlp(text) if token.lemma_ in targets]


def _load_ginza():
    """Return the ``ja_ginza`` pipeline, loading it only on the first call."""
    # Loading the pipeline is expensive, so keep the instance on the function
    # object instead of reloading it for every searched text.
    nlp = getattr(_load_ginza, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("ja_ginza")
        _load_ginza._nlp = nlp
    return nlp
|
| 23 |
+
|
| 24 |
+
# 「1単語からなるもの」は単語として検索、「2単語以上からなるもの」は文字列として検索
|
| 25 |
+
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
    """Find NG words in ``input_text`` using both matching strategies.

    Single-token entries (``ng_wordlist_1``) are matched token by token via
    ``find_word``; multi-token entries (``ng_wordlist_2``) are matched as
    strings via ``find_string``.

    Returns:
        A list of the distinct hits. Duplicates are removed through a set,
        so the order of the result is not defined.
    """
    token_hits = find_word(input_text, ng_wordlist_1)
    string_hits = find_string(input_text, ng_wordlist_2)
    return list(set(token_hits + string_hits))
|
| 29 |
+
|
| 30 |
+
def get_ng_wordlist(wordlist_path, discrepancies=False):
    """Read a raw NG word file and split it by token count.

    Args:
        wordlist_path: Path of a UTF-8 text file with one entry per line.
        discrepancies: When true, add hiragana, katakana, lower-cased and
            upper-cased variants of every entry and de-duplicate the result.

    Returns:
        A ``(single_token_entries, multi_token_entries)`` pair of lists,
        split by how many tokens the ``ja_ginza`` pipeline produces for
        each entry.
    """
    with io.open(wordlist_path, "r", encoding="utf-8") as handle:
        entries = [line for line in handle.read().split("\n") if len(line) > 0]

    # Cover spelling variation: hiragana and katakana forms, plus both
    # letter cases of the katakana form.
    if discrepancies:
        variants = []
        for entry in entries:
            katakana = jaconv.hira2kata(entry)
            variants.append(jaconv.kata2hira(entry))
            variants.append(katakana)
            variants.append(katakana.lower())
            variants.append(katakana.upper())
        entries = list(set(entries + variants))

    # Classify each entry as one token or two-or-more tokens.
    nlp = spacy.load("ja_ginza")
    single_token_entries = []
    multi_token_entries = []

    for entry in entries:
        token_count = len(nlp(entry))
        if token_count == 1:
            single_token_entries.append(entry)
        elif token_count >= 2:
            multi_token_entries.append(entry)

    return single_token_entries, multi_token_entries
|
| 58 |
+
|
| 59 |
+
def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
    """Load two already-classified NG word files.

    Each file is read as UTF-8 and split on ``"\\n"``; empty lines are
    dropped and every other line is kept unchanged.

    Args:
        wordlist_1_path: Path of the first word list file.
        wordlist_2_path: Path of the second word list file.

    Returns:
        A ``(ng_wordlist_1, ng_wordlist_2)`` pair of lists of strings.
    """
    loaded = []
    for path in (wordlist_1_path, wordlist_2_path):
        with io.open(path, "r", encoding="utf-8") as handle:
            lines = handle.read().split("\n")
        loaded.append([line for line in lines if line])

    first_list, second_list = loaded
    return first_list, second_list
|
utils/llm.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
from transformers import (
|
| 5 |
+
AutoModelForCausalLM,
|
| 6 |
+
AutoTokenizer,
|
| 7 |
+
BitsAndBytesConfig,
|
| 8 |
+
HfArgumentParser,
|
| 9 |
+
TrainingArguments,
|
| 10 |
+
pipeline,
|
| 11 |
+
logging,
|
| 12 |
+
)
|
| 13 |
+
from peft import LoraConfig, PeftModel
|
| 14 |
+
from trl import SFTTrainer
|
| 15 |
+
|
| 16 |
+
def load_llm_from_pretrained(pretrained_model_path):
    """Load the base language model, its tokenizer, and the PEFT adapter.

    With CUDA available, the base model is loaded on device 0 with 4-bit
    NF4 quantization and float16 compute; otherwise it is loaded on the CPU
    in float32.

    Args:
        pretrained_model_path: Directory holding the adapter weights that
            are applied on top of the base model.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    base_model_name = "line-corporation/japanese-large-lm-3.6b"
    use_gpu = torch.cuda.is_available()

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        use_fast=False,
        add_eos_token=True,
        trust_remote_code=True,
    )

    # Base model: quantized on GPU, full precision on CPU.
    if use_gpu:
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,                     # enable the 4-bit base model
                bnb_4bit_quant_type="nf4",             # quantization type (fp4 or nf4)
                bnb_4bit_compute_dtype=torch.float16,  # compute dtype (float16 or bfloat16)
                bnb_4bit_use_double_quant=False,       # nested (double) quantization off
            ),
            "device_map": {"": 0},
            "torch_dtype": torch.float16,
        }
    else:
        load_kwargs = {
            "device_map": {"": "cpu"},
            "torch_dtype": torch.float32,
        }
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **load_kwargs)

    # Apply the fine-tuned adapter on top of the base model.
    model = PeftModel.from_pretrained(
        base_model,
        pretrained_model_path,
        offload_folder='./',
    )

    return model, tokenizer
|
| 65 |
+
|
| 66 |
+
def inference(model, tokenizer, input_text, ngword_with_label, temperature=0.01, max_new_tokens=32):
    """Ask the model whether ``input_text`` is inappropriate.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        input_text: The text to classify.
        ngword_with_label: Labelled NG words found in the text; they are
            joined with ``", "`` and inserted into the prompt.
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, i.e. only the tokens generated after the
        prompt. Special tokens are not stripped.
    """
    # Build the prompt.
    ngword_text = ", ".join(ngword_with_label)
    prompt = f"""指示:
入力された文章が不適切(攻撃的、暴力的、差別的、性的、政治的)であるかどうか、「はい」か「いいえ」で回答してください。「はい」の場合は理由も添えてください。
{input_text}
NGワード:
{ngword_text}
応答:
"""

    # Run generation.
    input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    output_ids = model.generate(
        input_ids=input_ids.to(device=model.device),
        # Bound only the continuation. The former max_length=100 also counted
        # the prompt tokens, so a longer input left little or no room for the
        # answer.
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt tokens and decode only what was generated.
    prompt_length = input_ids.size(1)
    output = tokenizer.decode(output_ids.tolist()[0][prompt_length:])

    return output
|