commit files to HF hub

Files changed (10) hide show

.gitattributes +1 -0
config.json +27 -0
fasttext_fsc.py +56 -0
fasttext_jp_embedding.py +48 -0
fasttext_jp_tokenizer.py +143 -0
mecab_tokenizer.py +92 -0
pytorch_model.bin +3 -0
special_tokens_map.json +4 -0
tokenizer_config.json +12 -0
vocab.txt +3 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vocab.txt filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "architectures": [
+    "FastTextForSeuqenceClassification"
+  ],
+  "auto_map": {
+    "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
+    "AutoModel": "fasttext_fsc.FastTextForSeuqenceClassification"
+  },
+  "hidden_size": 300,
+  "id2label": {
+    "0": "entailment",
+    "1": "neutral",
+    "2": "contradiction"
+  },
+  "label2id": {
+    "contradiction": 2,
+    "entailment": 0,
+    "neutral": 1
+  },
+  "max_length": 128,
+  "model_type": "fasttext_jp",
+  "tokenizerI_class": "FastTextJpTokenizer",
+  "tokenizer_class": "FastTextJpTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.23.1",
+  "vocab_size": 500
+}

fasttext_fsc.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from __future__ import annotations
+from transformers import PretrainedConfig
+from torch import nn
+import torch
+from torchtyping import TensorType
+from .fasttext_jp_embedding import FastTextJpModel, FastTextJpConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+class FastTextForSeuqenceClassification(FastTextJpModel):
+    """FastTextのベクトルをベースとした分類を行います。
+    """
+    def __init__(self, config: FastTextJpConfig):
+        super().__init__(config)
+    def forward(self, **inputs) -> SequenceClassifierOutput:
+        """embeddingを行います。
+        Returns:
+            TensorType["batch", "word", "vectors"]: 単語ごとにベクトルを返します。
+        """
+        input_ids = inputs["input_ids"]
+        outputs = self.word_embeddings(input_ids)
+        sentence = outputs[torch.logical_and(inputs["attention_mask"] == 1,
+                                             inputs["token_type_ids"] == 0)]
+        candidate_label = outputs[torch.logical_and(
+            inputs["attention_mask"] == 1, inputs["token_type_ids"] == 1)]
+        sentence_mean = torch.mean(sentence, dim=-2, keepdim=True)
+        candidate_label_mean = torch.mean(candidate_label,
+                                          dim=-2,
+                                          keepdim=True)
+        if sentence_mean.dim() == 2:
+            p = torch.nn.functional.cosine_similarity(sentence_mean,
+                                                      candidate_label_mean,
+                                                      dim=1)
+            logits = [[torch.log(p), -torch.inf, torch.log(1 - p)]]
+        else:
+            logits = []
+            # batch
+            for sm, clm in zip(sentence_mean, candidate_label_mean):
+                p = torch.nn.functional.cosine_similarity(sm, clm, dim=1)
+                logits.append([[torch.log(p), -torch.inf, torch.log(1 - p)]])
+        logits = torch.FloatTensor(logits)
+        return SequenceClassifierOutput(
+            loss=None,
+            logits=logits,  # type: ignore
+            hidden_states=None,
+            attentions=None,
+        )
+# The class must be registered with AutoModel, but the way to do so seems to keep changing and has not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextForSeuqenceClassification.register_for_auto_class("AutoModel")

fasttext_jp_embedding.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from __future__ import annotations
+from transformers import PretrainedConfig
+from transformers import PreTrainedModel
+from torch import nn
+import torch
+from torchtyping import TensorType
+class FastTextJpConfig(PretrainedConfig):
+    """FastTextJpModelのConfig
+    """
+    model_type = "fasttext_jp"
+    def __init__(self, tokenizer_class="FastTextJpTokenizer", **kwargs):
+        """初期化処理
+        Args:
+            tokenizer_class (str, optional):
+                tokenizer_classを指定しないと、pipelineから読み込まれません。
+                config.jsonに記載されます。
+        """
+        kwargs["tokenizer_class"] = tokenizer_class
+        super().__init__(**kwargs)
+class FastTextJpModel(PreTrainedModel):
+    """FastTextのEmbeddingを行います。
+    """
+    config_class = FastTextJpConfig
+    def __init__(self, config: FastTextJpConfig):
+        super().__init__(config)
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size)
+    def forward(self, **inputs) -> TensorType["batch", "word", "vectors"]:
+        """embeddingを行います。
+        Returns:
+            TensorType["batch", "word", "vectors"]: 単語ごとにベクトルを返します。
+        """
+        return self.word_embeddings(torch.Tensor(inputs["input_ids"]))
+# The classes must be registered with AutoModel, but the way to do so seems to keep changing and has not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextJpConfig.register_for_auto_class()
+FastTextJpModel.register_for_auto_class("AutoModel")

fasttext_jp_tokenizer.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from __future__ import annotations
+from .mecab_tokenizer import MeCabTokenizer
+import os
+# File name of the vocabulary; used as FastTextJpTokenizer.vocab_files_names
+# and as the output file name in FastTextJpTokenizer.save_vocabulary.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+def save_stoi(stoi: dict[str, int], vocab_file: str):
+    """単語IDの辞書を配列にしてvocab_fileに保存します。
+    Args:
+        stoi (dict[str, int]): 単語IDのマッピング
+        vocab_file (str): 保存するパス
+    Raises:
+        ValueError: IDが途切れているとエラーを起こします。
+    """
+    with open(vocab_file, "w", encoding="utf-8") as writer:
+        index = 0
+        for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
+            if index != token_index:
+                raise ValueError(
+                    "Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                    " Please check that the vocabulary is not corrupted!")
+            writer.write(token + "\n")
+            index += 1
+def load_stoi(vocab_file: str) -> dict[str, int]:
+    """ファイルから単語IDの辞書をロードします。
+    Args:
+        vocab_file (str): ファイルのパス
+    Returns:
+        dict[str, int]: 単語IDのマッピング
+    """
+    stoi: dict[str, int] = {}
+    # ファイルから読み出し
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    # 単語IDのマッピングを生成します。
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        stoi[token] = index
+    return stoi
+class FastTextJpTokenizer(MeCabTokenizer):
+    # Configが認識するのに必要です。
+    # https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
+    model_type = "fasttext_jp"
+    # vocab.txtを認識するのにおそらく必要。
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(self,
+                 vocab_file: str,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """初期化処理
+        Args:
+            vocab_file (str): vocab_fileのpath
+            hinshi (list[str] | None, optional): 抽出する品詞
+            mecab_dicdir (str | None, optional): dicrcのあるディレクトリ
+        """
+        super().__init__(hinshi, mecab_dicdir, **kwargs)
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.stoi = load_stoi(vocab_file)
+        self.itos = dict([(ids, tok) for tok, ids in self.stoi.items()])
+    @property
+    def vocab_size(self) -> int:
+        """ボキャブラリのサイズ
+        ※PreTrainedTokenizerで実装すべき必須の関数。
+        Returns:
+            int: ボキャブラリのサイズ
+        """
+        return len(self.stoi)
+    def _convert_token_to_id(self, token: str) -> int:
+        """単語からID
+        ※PreTrainedTokenizerで実装すべき必須の関数。
+        Args:
+            token (str): 単語
+        Returns:
+            int: ID
+        """
+        return self.stoi[token]
+    def _convert_id_to_token(self, index: int) -> str:
+        """IDから単語
+        ※PreTrainedTokenizerで実装すべき必須の関数。
+        Args:
+            index (int): ID
+        Returns:
+            str: 単語
+        """
+        return self.itos[index]
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: str | None = None) -> tuple[str]:
+        """ボキャブラリの保存
+        Args:
+            save_directory (str): 保存するディレクトリ。ファイル名はvocab.txtに固定
+            filename_prefix (str | None, optional): ファイルのprefix
+        Returns:
+            tuple[str]: ファイル名を返す。
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") +
+                VOCAB_FILES_NAMES["vocab_file"])
+        else:
+            vocab_file = (filename_prefix +
+                          "-" if filename_prefix else "") + save_directory
+        save_stoi(self.stoi, vocab_file)
+        return (vocab_file, )
+# The class must be registered with AutoTokenizer, but the way to do so seems to keep changing and has not settled. (2022/11/6)
+# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
+FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")

mecab_tokenizer.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from __future__ import annotations
+from typing import NamedTuple
+import MeCab
+from transformers import PreTrainedTokenizer
+class MeCabResult(NamedTuple):
+    """Type of one MeCab analysis result.
+
+    ``hyosokei`` is filled from the node surface; the remaining fields are
+    the comma-separated feature fields of the node, in order.
+    """
+    # Surface form of the node.
+    hyosokei: str
+    # Feature field 1; the name means "part of speech".
+    hinshi: str
+    # Feature fields 2-4; the names mean "part-of-speech subcategory 1-3".
+    hinshi_saibunrui_1: str
+    hinshi_saibunrui_2: str
+    hinshi_saibunrui_3: str
+    # Feature fields 5-6; the names refer to conjugation.
+    # NOTE(review): presumably conjugation type and form as in IPADIC; confirm.
+    katsuyokei_1: str
+    katsuyokei_2: str
+    # Feature field 7; the name means "base form".
+    genkei: str
+    # Feature field 8; the name means "reading".
+    yomi: str
+    # Feature field 9; the name means "pronunciation".
+    hatsuon: str
+class MeCabTokenizer(PreTrainedTokenizer):
+    def __init__(self,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """初期化処理
+        Args:
+            hinshi (list[str] | None): 抽出する品詞
+            mecab_dicdir (str | None, optional): dicrcのあるディレクトリ
+        """
+        self.target_hinshi = hinshi
+        if mecab_dicdir is not None:
+            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir}")
+        else:
+            self.mecab = MeCab.Tagger()
+        super().__init__(**kwargs)
+    def _tokenize(self, text: str) -> list[str]:
+        """文章から特定の品詞の単語を返します。
+        Args:
+            text (str): 文章
+        Returns:
+            list[str]: 特定の品詞の単語
+        """
+        out = []
+        # Mecabで分析します。
+        result_words = self.mecab_analyze(text)
+        for result_word in result_words:
+            # 最初と最後は空文字
+            if result_word.hyosokei == "":
+                continue
+            if self.target_hinshi is not None:
+                if result_word.hinshi in self.target_hinshi:
+                    # 特定の品詞のみ返します。
+                    out.append(result_word.hyosokei)
+                else:
+                    continue
+            else:
+                out.append(result_word.hyosokei)
+        return out
+    def mecab_analyze(self, text: str) -> list[MeCabResult]:
+        """文章をMecabで分析します。
+        Args:
+            text (str): 文章
+        Returns:
+            list[MeCabResult]: MeCabの解析結果
+        """
+        node = self.mecab.parseToNode(text)
+        #形態素1つ1つを処理
+        out = []
+        while node:
+            args = []
+            args.append(node.surface)
+            feature = node.feature.split(",")
+            args.extend(feature)
+            mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                       args[4], args[5], args[6], args[7],
+                                       args[8], args[9])
+            out.append(mecab_result)
+            node = node.next  # 最後のEOSを省く
+        return out

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6bd689ac294c1623aa08045af576207197ec480898bbb9f4057b062f63cfdf4f
+size 600829

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "pad_token": "*",
+  "unk_token": "*"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "auto_map": {
+    "AutoTokenizer": [
+      "fasttext_jp_tokenizer.FastTextJpTokenizer",
+      null
+    ]
+  },
+  "model_max_length": 128,
+  "pad_token": "*",
+  "tokenizer_class": "FastTextJpTokenizer",
+  "unk_token": "*"
+}

vocab.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a1770ed0a47f44e882afc3f56271a16bc8dba675f18dd61e2cffac276b49acc
+size 29910902