---
language:
- ko
library_name: transformers
pipeline_tag: token-classification
---
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig

# Korean spacing restoration: the model tags each character with a
# BIOES-style label; a space is inserted after every "E" (end-of-word) tag.
tokenizer = AutoTokenizer.from_pretrained("fiveflow/roberta-base-spacing")
roberta = AutoModelForTokenClassification.from_pretrained("fiveflow/roberta-base-spacing")
roberta.eval()  # inference only — disable dropout

org_text = "ํ์์ค๋ฆฝ๊ณผESG๊ฒฝ์์๋ํ์ฌํ์ ์๊ตฌํ๋".replace(" ", "")  # remove any existing spaces
label = ["UNK", "PAD", "O", "B", "I", "E", "S"]

# Tokenize character by character so each input character maps to exactly
# one token position (encode() wraps with special tokens; [1] keeps only
# the character's own token id).
token_list = [tokenizer.cls_token_id]
for char in org_text:
    token_list.append(tokenizer.encode(char)[1])
token_list.append(tokenizer.eos_token_id)

tkd = torch.tensor(token_list).unsqueeze(0)  # add batch dimension
with torch.no_grad():  # no gradients needed for inference
    output = roberta(tkd).logits
_, pred_idx = torch.max(output, dim=2)  # per-position predicted label index

# Rebuild the sentence, splitting on the "E" tag; [1:-1] drops the
# predictions for the CLS and EOS positions.
pred_sent = ""
for char_idx, spc_idx in enumerate(pred_idx.squeeze()[1:-1]):
    if label[spc_idx] == "E":
        pred_sent += org_text[char_idx] + " "
    else:
        pred_sent += org_text[char_idx]
print(pred_sent.strip())
# 'ํ์์ค๋ฆฝ๊ณผ ESG ๊ฒฝ์์ ๋ํ ์ฌํ์ ์๊ตฌ ํ๋'