Upload folder using huggingface_hub
- README.md +93 -0
- config.json +28 -0
- model.safetensors +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +15 -0
- tokenizer_config.json +57 -0
README.md
ADDED
@@ -0,0 +1,93 @@
---
language:
- ja
tags:
- biomedical
- text
license: cc-by-4.0
datasets:
- JMED-DICT-mini
base_model: "xlm-roberta-base"
---

# MedTXTNorm

**MedTXTNorm** is a model for normalizing Japanese medical terms. It is fine-tuned from [cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR](https://huggingface.co/cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR), which builds on [xlm-roberta-base](https://huggingface.co/xlm-roberta-base), using a subset of JMED-DICT (approximately 30k term-concept pairs).

## How to use

The following script converts a list of strings (entity names) into embeddings and performs a similarity search.

- `jmed_dict_mini_demo`: normalization candidates from a subset of JMED-DICT-mini
- `questions`: surface forms (e.g. '脱水')
- `answers`: normalized forms (e.g. '脱水症')

```python
import time

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# 1. Setup
model_name = "sociocom/MedTXTNorm"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device).eval()

# 2. Data
jmed_dict_mini_demo = ['脱水症', '高張性脱水症', '口渇症', '発汗障害', '羊水過少症', '破水', '水中毒', '両側水腎症', '下血', '溺水']
questions, answers = ['脱水'], ['脱水症']
top_k = 10

# 3. Inference (embedding & search)
def embed(texts):
    with torch.no_grad():
        inputs = tokenizer(texts, padding=True, truncation=True, max_length=25, return_tensors="pt").to(device)
        # CLS-token embeddings, L2-normalized so a dot product equals cosine similarity
        return F.normalize(model(**inputs)[0][:, 0, :], p=2, dim=1)

if device == "cuda":
    torch.cuda.synchronize()
start = time.time()

# Build the embeddings
query_embs = embed(questions)           # shape: (Batch, Dim)
dict_embs = embed(jmed_dict_mini_demo)  # shape: (N, Dim)

# Similarity matrix (matrix product of normalized vectors = cosine similarity)
# (Batch, Dim) @ (Dim, N) -> (Batch, N)
similarity_matrix = torch.matmul(query_embs, dict_embs.T)

# Take the top-k candidates per query
top_vals, top_idxs = torch.topk(similarity_matrix, k=top_k)

if device == "cuda":
    torch.cuda.synchronize()
print(f"Time: {time.time() - start:.4f} sec")

# 4. Formatting
# Move the tensors to Python lists so the loop below stays off the GPU
top_vals_list = top_vals.tolist()
top_idxs_list = top_idxs.tolist()

results = []
for i, (q, a) in enumerate(zip(questions, answers)):
    candidates = []
    for val, idx in zip(top_vals_list[i], top_idxs_list[i]):
        name = jmed_dict_mini_demo[idx]
        score = float(f"{val:.3g}")  # three significant digits
        candidates.append((name, score))
    results.append({"input": q, "answer": a, "candidates": candidates})

print(results)
# Time: 0.0303 sec
# [{'input': '脱水', 'answer': '脱水症', 'candidates': [('脱水症', 0.986), ('羊水過少症', 0.532), ('溺水', 0.491), ('口渇症', 0.49), ('水中毒', 0.482), ('発汗障害', 0.468), ('下血', 0.452), ('高張性脱水症', 0.447), ('両側水腎症', 0.442), ('破水', 0.409)]}]
```
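
The demo above embeds a ten-entry dictionary in a single call. For a larger candidate list, the same `embed` helper can be applied in chunks before the matrix product. A minimal sketch reusing the variables from the script; the `embed_in_batches` helper and its batch size are illustrative, not part of the released code:

```python
def embed_in_batches(texts, batch_size=256):
    # Embed in fixed-size chunks to bound peak GPU memory, then concatenate
    chunks = [embed(texts[i:i + batch_size]) for i in range(0, len(texts), batch_size)]
    return torch.cat(chunks, dim=0)

dict_embs = embed_in_batches(jmed_dict_mini_demo)  # same shape as before: (N, Dim)
```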
config.json
ADDED
@@ -0,0 +1,28 @@
{
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
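
The config declares a 24-layer XLM-RoBERTa encoder with hidden size 1024. A minimal sketch, assuming the `sociocom/MedTXTNorm` repo id from the README, that loads and inspects it:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("sociocom/MedTXTNorm")
print(cfg.model_type)         # xlm-roberta
print(cfg.hidden_size)        # 1024
print(cfg.num_hidden_layers)  # 24
```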
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0bdc447471e5af437f3d57038273b30445deb80619c1ca2a18823d4aec84822d
size 2239607176
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
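
The two entries above are Git LFS pointer files (version, oid, size), not the binaries themselves. A minimal sketch, assuming the actual files have been fetched (e.g. via `git lfs pull`), for checking a download against its pointer:

```python
import hashlib
from pathlib import Path

def sha256_of(path, chunk_size=1 << 20):
    # Stream in 1 MiB chunks so the 2.2 GB weight file never sits fully in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

p = Path("model.safetensors")  # assumes the real file replaced the pointer locally
print(sha256_of(p) == "0bdc447471e5af437f3d57038273b30445deb80619c1ca2a18823d4aec84822d")
print(p.stat().st_size == 2239607176)
```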
special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
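
A quick sanity check of these mappings after loading the tokenizer (a sketch, assuming the `sociocom/MedTXTNorm` repo id from the README):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sociocom/MedTXTNorm")
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.mask_token)
# <s> </s> <pad> <mask>
```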
tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "cls_token": "<s>",
  "do_lower_case": true,
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "sp_model_kwargs": {},
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}
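
The tokenizer is a SentencePiece-based `XLMRobertaTokenizer` capped at 512 tokens. A short sketch, again assuming the repo id from the README, showing how a surface form from the usage example is tokenized with the special tokens applied:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sociocom/MedTXTNorm")
enc = tok("脱水", truncation=True, max_length=25)
print(tok.convert_ids_to_tokens(enc["input_ids"]))
# The first and last tokens are the <s>/</s> specials defined above
```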