Spaces:

awarefy
/

amp

Sleeping

App Files Files Community

terada/init-package

by terapyon - opened Apr 19, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+14

-529

Files changed (12) hide show

.gitattributes +0 -2
.gitignore +0 -160
README.md +0 -37
app.py +14 -26
inference.py +0 -223
requirements-dev.txt +0 -1
requirements.txt +0 -13
saved_model/stop_words/Japanese_selection.txt +0 -3
saved_model/topic/trained_model.bin +0 -3
tests/__pycache__/test_app.cpython-311-pytest-8.1.1.pyc +0 -0
tests/test_app.py +0 -30
visualization.py +0 -31

.gitattributes CHANGED Viewed

@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-saved_model/stop_words/Japanese_selection.txt filter=lfs diff=lfs merge=lfs -text
-saved_model/topic/trained_model.bin filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,160 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Cython debug symbols
-cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/

README.md CHANGED Viewed

@@ -17,40 +17,3 @@ license: unknown
 - Python 3.11
 - Streamlit 1.33
-### 仮想環境
-venvを用いてインストールを行います。
-venvは、Pythonの標準ライブラリです。
-https://docs.python.org/ja/3/tutorial/venv.html
-```sh
-% cd (任意のフォルダ)
-% python3 -m venv venv
-% source venv/bin/activate
-```
-### インストール
-GitHubからパッケージをダウンロードしてインストール
-```sh
-(venv) % git clone https://github.com/awarefy/amp.git
-(venv) % cd amp
-(venv) % pip install -r requirements.txt -c constraints.txt
-```
-## 起動方法
-```
-(venv) % streamlit run app.py
-```
-## 表示確認
-起動すると、デフォルトブラウザが立ち上がり表示確認ができる。
-もし、ブラウザが立ち上がらない場合は、コンソールに表示されるポート付URLをブラウザで呼び出す。


17	- Python 3.11
18	- Streamlit 1.33
19

app.py CHANGED Viewed

@@ -1,18 +1,16 @@
-import streamlit as st
-from inference import classify_ma, get_word_attn, infer_topic
-from visualization import heatmap, html_hext
 ID2CAT = {
-    0: "マイクロアグレッションではない可能性が高い",
-    1: "マイクロアグレッションの可能性が高い",
 }
 explanation_text = """
-このマイクロアグレッションチェッカーは、機械学習（AI技術のようなもの）によって、マイクロアグレッションらしい表現を検出できるように設計されています。
 """
 attention_text = """
-【結果を見る際の注意点】
-この技術は「文中にマイクロアグレッションに結びつく要素が含まれているかどうか」を判定するモデルであり、
 必ずしも「この文章の書き手がマイクロアグレッションをしている」ことを明確に示すものではありません。
 判定結果を元に、改めて人間同士で「なぜ/どのようにしてマイクロアグレッションたりうるか」議論をするために利用してください。
@@ -22,31 +20,21 @@ provide_by = """提供元: オールマイノリティプロジェクト
 [https://all-minorities.com/](https://all-minorities.com/)
 """
 st.title("マイクロアグレッション判別モデル")
 st.markdown(explanation_text)
 user_input = st.text_input("文章を入力してください:", key="user_input")
-if st.button("判定", key="run"):
     if not user_input:
-        st.warning("入力が空です。何か入力してください。")
     else:
-        pred_class, input_ids, attention_list = classify_ma(user_input)
-        st.markdown(f"判定結果: **{ID2CAT[pred_class]}**")
-        if pred_class == 1:
-            topic_dist, ll = infer_topic(user_input)
-            words_atten = get_word_attn(input_ids, attention_list)
-            html_hext_result = html_hext(((word, attn) for word, attn in words_atten))
-            st.markdown(html_hext_result, unsafe_allow_html=True)
-            data = topic_dist.reshape(-1, 1)
-            st.plotly_chart(heatmap(data), use_container_width=True)
-        st.divider()
         st.markdown(attention_text)
-st.divider()
-st.markdown(provide_by)

+import random
+import streamlit as st
 ID2CAT = {
+    0: "マイクロアグレッションではない",
+    1: "マイクロアグレッションである",
 }
 explanation_text = """
+このマイクロアグレッションチェッカーは、機械学習（AI技術のようなもの）によって、マイクロアグレッションらしい言語を検出できるように設計されています。
 """
 attention_text = """
+この技術は「文中にマイクロアグレッションに結びつく要素が含まれているかどうか」を判定するモデルになっています。
 必ずしも「この文章の書き手がマイクロアグレッションをしている」ことを明確に示すものではありません。
 判定結果を元に、改めて人間同士で「なぜ/どのようにしてマイクロアグレッションたりうるか」議論をするために利用してください。
 [https://all-minorities.com/](https://all-minorities.com/)
 """
 st.title("マイクロアグレッション判別モデル")
 st.markdown(explanation_text)
 user_input = st.text_input("文章を入力してください:", key="user_input")
+if st.button("判定"):
     if not user_input:
+        st.write("入力が空です。何か入力してください。")
     else:
         st.markdown(attention_text)
+        st.divider()
+        random_id = random.randint(0, 1)
+        st.markdown(f"判定結果: **{ID2CAT[random_id]}**")
+        st.divider()
+st.markdown(provide_by)

inference.py DELETED Viewed

@@ -1,223 +0,0 @@
-import os
-import re
-from pathlib import Path
-from typing import Generator
-from unicodedata import normalize
-import numpy as np
-import streamlit as st
-import tomotopy as tp  # type: ignore
-import torch
-import torch.nn as nn
-import transformers as T  # type: ignore
-from huggingface_hub import PyTorchModelHubMixin  # type: ignore
-from scipy import stats  # type: ignore
-from sudachipy import dictionary, tokenizer  # type: ignore
-HF_AUTH_TOKEN = os.getenv("HF_AUTH_TOKEN")
-MODELS_PATH = Path(__file__).parent / "saved_model"
-# model_base_path = MODELS_PATH / "two_class"
-MODEL_BASE = "awarefy/awarefy-two_class-trained-"
-topic_model_trained = MODELS_PATH / "topic" / "trained_model.bin"
-japanese_selection_path = MODELS_PATH / "stop_words" / "Japanese_selection.txt"
-# GPUの指定
-if torch.cuda.is_available():
-    gpu = 0
-    # gpu = -1  # For debugging
-else:
-    gpu = -1  # gpu = -1  # GPUが使用できなければ(CPUで処理)-1を指定
-# cls_num = 3
-max_length = 512
-k_folds = 10
-bert_model_name = "cl-tohoku/bert-base-japanese-v3"
-device = torch.device(f"cuda:{gpu}" if gpu>=0 else "cpu")
-#BERTモデルの定義
-class BertClassifier(nn.Module, PyTorchModelHubMixin):
-    def __init__(self, cls_num: int):
-        super().__init__()
-        self.bert = T.BertModel.from_pretrained(bert_model_name, output_attentions=True)
-        self.fc = nn.Linear(768, cls_num, bias=True)
-        nn.init.normal_(self.fc.weight, std=0.02)
-        nn.init.normal_(self.fc.bias, 0)
-    def forward(self, input_ids, masks):
-        result = self.bert(input_ids, masks)
-        vec = result[0]
-        _ = result[1]
-        attentions = result[2]
-        vec = vec[:, 0, :]
-        vec = vec.view(-1, 768)
-        output = self.fc(vec)
-        return output, _, attentions
-#日本語Stopwords除去関数
-def load_stopwords() -> set[str]:
-    with open(japanese_selection_path, "r", encoding="utf-8") as f:
-        # stopwords = [w.strip() for w in f]
-        # stopwords = set(stopwords)
-        stopwords = {w.strip() for w in f if w.strip()}
-    return stopwords
-class SudachiTokenizer:
-    def __init__(self, split_mode="C"):
-        self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
-        self.stopwords = load_stopwords()
-        if split_mode == "A":
-            self.mode = tokenizer.Tokenizer.SplitMode.C
-        elif split_mode == "B":
-            self.mode = tokenizer.Tokenizer.SplitMode.B
-        else:
-            self.mode = tokenizer.Tokenizer.SplitMode.C
-        # ひらがなのみの文字列にマッチする正規表現
-        self.kana_re = re.compile("^[ぁ-ゖ]+$")
-        #Stopwords
-        self.stopwords = load_stopwords()
-    def get_wakati(self, text: str) -> list[str]:
-        wakati_list = []
-        normalized_wakati_list = []
-        pos_list = []
-        normalized_text = normalize("NFKC", text)
-        tmp = re.sub(r'[0-9]','',normalized_text)
-        tmp = re.sub(r'[０-９]', '', tmp)
-        tmp = re.sub(r'[、。：（）「」%『』（）？！％→＋｀.・×,〜～—+＝♪/!?]','',tmp)
-        tmp = re.sub(r'[a-zA-Z]','',tmp)
-        #絵文字除去
-        tmp = re.sub(r'[❓]', "", tmp)
-        for m in self.tokenizer_obj.tokenize(tmp, self.mode):
-            word = m.surface()
-            pos = m.part_of_speech()[0]
-            normalized_word = m.normalized_form()
-            wakati_list.append(word)
-            normalized_wakati_list.append(normalized_word)
-            pos_list.append(pos)
-        #名詞，動詞，形容詞のみに絞り込み
-        target_pos = ["名詞", "動詞", "形容詞"]
-        #target_pos = ["名詞", "形容詞"]
-        token_list = [t for t, p in zip(wakati_list, pos_list) if p in target_pos]
-        #アルファベットを小文字に統一
-        token_list = [t.lower() for t in token_list]
-        #ひらがなのみの単語を除く
-        #token_list = [t for t in token_list if not self.kana_re.match(t)]
-        #ストップワード除去
-        token_list = [t for t in token_list if t not in self.stopwords]
-        return token_list
-def make_traind_model():
-    trained_models = []
-    for k in range(k_folds):
-        k = k + 1
-        # model_path = model_base_path / f"trained_model{k}.pt"
-        # trained_model = copy.deepcopy(bert_model)
-        # trained_model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
-        # trained_models.append(trained_model)
-        model_name = MODEL_BASE + str(k)
-        trained_model = BertClassifier.from_pretrained(model_name, token=HF_AUTH_TOKEN).to(device)
-        print(f"Got model {model_name}")
-        trained_models.append(trained_model)
-    return trained_models
-@st.cache_resource
-def init_models():
-    # bert_model = BertClassifier(cls_num=1) #出力ノードを1に設定
-    # bert_model.eval()
-    # bert_model.to(device)
-    tokenizer_sudachi = SudachiTokenizer(split_mode="C")
-    #Tokenizerの設定(ここではtokenizerをtokenizer_c2にしている)
-    tokenizer_c2 = T.BertJapaneseTokenizer.from_pretrained(bert_model_name)
-    # trained_models = make_traind_model(bert_model)
-    trained_models = make_traind_model()
-    return tokenizer_sudachi, tokenizer_c2, trained_models
-tokenizer_sudachi, tokenizer_c2, trained_models = init_models()
-# Attentionマップを算出する関数の定義
-def f_a(sentences: list[str], tokenizer_c2, model, device):
-    encoded = tokenizer_c2.batch_encode_plus(
-                sentences,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_attention_mask=True
-                )
-    input_ids = torch.tensor(encoded["input_ids"]).to(device)
-    attention_mask = torch.tensor(encoded["attention_mask"]).to(device)
-    with torch.no_grad():
-        outputs, _, attentions = model(input_ids, attention_mask)
-    #return input_ids.detach().cpu(), attentions[-1].detach().cpu()
-    return input_ids.detach().cpu(), attentions[-1].detach().cpu(), outputs.detach().cpu()
-def get_word_attn(input_ids, attention_weight) -> Generator[tuple[str, float], None, None]:
-    # 文章の長さ分のzero tensorを宣言
-    seq_len = attention_weight.size()[2]
-    all_attens = torch.zeros(seq_len)
-    # 12個のMulti Head Attentionの結果を全部足し合わせる
-    # 最初の0はinput_idsは１文章だけを想定しているため
-    # 次の0はCLSトークンのAttention結果を取得している、という意味です。
-    for i in range(12):
-        all_attens += attention_weight[0, i, 0, :]
-    for word, attn in zip(input_ids.flatten(), all_attens):
-        if tokenizer_c2.convert_ids_to_tokens(word.tolist()) == "[CLS]":
-            continue
-        if tokenizer_c2.convert_ids_to_tokens(word.tolist()) == "[SEP]":
-            break
-        converted_word = tokenizer_c2.convert_ids_to_tokens([word.numpy().tolist()])[0]
-        yield converted_word, attn
-def classify_ma(sentence: str) -> tuple[int, torch.Tensor, torch.Tensor]:
-    normalized_sentence = normalize("NFKC", sentence)
-    tmp = re.sub(r'[0-9]','',normalized_sentence)
-    tmp = re.sub(r'[０-９]', '', tmp)
-    tmp = re.sub(r'[、。：（）「」%『』（）？！％→＋｀.・×,〜～—+＝♪/!?]','',tmp)
-    tmp = re.sub(r'[a-zA-Z]','',tmp)
-    #絵文字除去
-    tmp = re.sub(r'[❓]', "", tmp)
-    attention_list, output_list = [], []
-    for trained_model in trained_models:
-        input_ids, attention, output = f_a([tmp], tokenizer_c2, trained_model, device)
-        attention_list.append(attention)
-        output_list.append(output)
-    #出力された10個の予測値の多数決を算出
-    outputs = np.concatenate(output_list)
-    prob_column = torch.sigmoid(torch.tensor(outputs))
-    pred_column = torch.ge(prob_column, 0.5).float()
-    ensemble_pred, count = stats.mode(pred_column)
-    #出力された10個のattention mapの平均値を算出
-    attentions = torch.concat(attention_list)
-    mean_attention = torch.mean(attentions, dim=0).unsqueeze(dim=0)
-    return ensemble_pred.item(), input_ids, mean_attention
-#モデルのロードとinferの関数化
-def infer_topic(new_text: str) -> tuple[np.ndarray, float]:
-    model_trained = tp.CTModel.load(str(topic_model_trained))
-    new_word_list = tokenizer_sudachi.get_wakati(new_text)
-    new_doc = model_trained.make_doc(new_word_list)
-    topic_dist, ll = model_trained.infer(new_doc)
-    return topic_dist, ll

requirements-dev.txt CHANGED Viewed

@@ -1,4 +1,3 @@
 -r requirements.txt
 ruff
 mypy
-pytest

 -r requirements.txt
 ruff
 mypy

requirements.txt CHANGED Viewed

@@ -1,14 +1 @@
 streamlit
-numpy
-pandas
-plotly
-transformers
-scipy
-torch
-fugashi
-unidic-lite
-sudachipy
-sudachidict_full
-sudachidict_core
-tomotopy


1	streamlit

saved_model/stop_words/Japanese_selection.txt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b9654e2f6f739a61285f80538e1076d938f54f090974d8f872ad59b246a66da8
-size 2202

saved_model/topic/trained_model.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:46c0cc05fcb664763839ca099f04aa5275a269cfbea8847f33214bc73affdcce
-size 695117

tests/__pycache__/test_app.cpython-311-pytest-8.1.1.pyc DELETED Viewed

Binary file (4.87 kB)

tests/test_app.py DELETED Viewed

@@ -1,30 +0,0 @@
-import sys
-from pathlib import Path
-import pytest
-from streamlit.testing.v1 import AppTest
-sys.path.append(str(Path(__file__).parent.parent))
-def test_text_no_input():
-    at = AppTest.from_file("app.py").run()
-    at.button[0].click().run()
-    assert at.warning[0].value == "入力が空です。何か入力してください。"
-def test_text_with_input():
-    at = AppTest.from_file("app.py").run()
-    # at.text_input[0].assert_exists()
-    at.text_input[0].input("test").run()
-    at.button[0].click().run()
-    assert "判定結果: **マイクロアグレッションで" in at.markdown[2].value
-@pytest.mark.skip(reason="まだ実装していないのでランダムに返ってくる")
-def test_aggression():
-    at = AppTest.from_file("app.py").run()
-    text = "サンプルの入力文字列NHKの番組を見ていると,発達障害者の才能を特集されることが多い。それを見ていると自分もそのような才能を期待されているように感じる"
-    at.text_input[0].input(text).run()
-    at.button[0].click().run()
-    assert "提供元: " not in at.markdown[3].value

visualization.py DELETED Viewed

@@ -1,31 +0,0 @@
-from typing import Iterable
-import numpy as np
-import plotly.express as px  # type: ignore
-def highlight(word: str, attn: float) -> str:
-    color = "#%02X%02X%02X" % (255, int(255 * (1 - attn)), int(255 * (1 - attn)))
-    return f'<span style="background-color: {color}">{word}</span>'
-def html_hext(words_attn: Iterable[tuple[str, float]]) -> str:
-    return " ".join(highlight(word, attn) for word, attn in words_attn)
-def heatmap(data: np.ndarray):
-    y_labels = [
-        "嘲笑や特性を理解されない",
-        "特性や能力への攻撃",
-        "学校や職場で受け入れられない",
-        "特性をおかしいとみなされる",
-        "障害への差別や苦悩をなかったことにされる",
-        "うまくコミュニケーションがとれない",
-        "障害について理解されない",
-        "侮蔑される，認められない",
-        "周囲の理解不足",
-        "障害をなかったことにされる，責められる",
-    ]
-    fig = px.imshow(data, labels=dict(x="判定", y="名称"), y=y_labels)
-    fig.update_layout(coloraxis_colorbar=dict(title="得点"))
-    return fig