Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
https://github.com/BM-K/Sentence-Embedding-is-all-you-need

# Korean-Sentence-Embedding
🍭 Korean sentence embedding repository. You can download the pre-trained models and inference right away, also it provides environments where individuals can train models.

## Quick tour
```python
import torch
from transformers import AutoModel, AutoTokenizer

def cal_score(a, b):
    """Return pairwise cosine similarity between `a` and `b`, scaled to 0-100."""
    # Promote 1-D vectors to single-row matrices so torch.mm applies uniformly.
    a = a.unsqueeze(0) if a.dim() == 1 else a
    b = b.unsqueeze(0) if b.dim() == 1 else b

    # L2-normalize each row, then a dot product gives the cosine similarity.
    a_unit = a / a.norm(dim=1, keepdim=True)
    b_unit = b / b.norm(dim=1, keepdim=True)
    return torch.mm(a_unit, b_unit.t()) * 100

model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta')

sentences = ['치타가 들판을 가로 질러 먹이를 쫓는다.',
             '치타 한 마리가 먹이 뒤에서 달리고 있다.',
             '원숭이 한 마리가 드럼을 연주한다.']

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
embeddings, _ = model(**inputs, return_dict=False)

score01 = cal_score(embeddings[0][0], embeddings[1][0])
score02 = cal_score(embeddings[0][0], embeddings[2][0])
```