lst-nectec
/

HoogBERTa

@@ -1,10 +1,10 @@
 ---
 license: mit
 datasets:
 - scb_mt_enth_2020
 - oscar
 - wikipedia
-- best2009
 language:
 - th
 library_name: transformers
@@ -27,7 +27,8 @@ pip install attacut
 To initialize the model from hub, use the following commands
 ```python
 from transformers import AutoTokenizer, AutoModel
-from attacut import tokenize
 tokenizer = AutoTokenizer.from_pretrained("new5558/HoogBERTa")
 model = AutoModel.from_pretrained("new5558/HoogBERTa")
@@ -41,38 +42,39 @@ To annotate POS, NE, and clause boundary, use the following commands
 To extract token features, based on the RoBERTa architecture, use the following commands
 ```python
 with torch.no_grad():
-    model.eval()
-    sentence = "วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"
-    all_sent = []
-    sentences = sentence.split(" ")
-    for sent in sentences:
-        all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
-    sentence = " _ ".join(all_sent)
-    tokenized_text = tokenizer(sentence, return_tensors = 'pt')
-    token_ids = tokenized_text['input_ids']
-    features = model(**tokenized_text)
 ```
 For batch processing,
 ```python
 with torch.no_grad():
-    model.eval()
-    sentenceL = ["วันที่ 12 มีนาคมนี้","ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"]
-    inputList = []
-    for sentX in sentenceL:
-        sentences = sentX.split(" ")
-        all_sent = []
-        for sent in sentences:
-            all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
-        sentence = " _ ".join(all_sent)
-        inputList.append(sentence)
-    tokenized_text = tokenizer(inputList, padding = True, return_tensors = 'pt')
-    token_ids = tokenized_text['input_ids']
-    features = model(**tokenized_text)
 ```
 To use HoogBERTa as an embedding layer, use

 ---
 license: mit
 datasets:
+- best2009
 - scb_mt_enth_2020
 - oscar
 - wikipedia
 language:
 - th
 library_name: transformers
 To initialize the model from the hub, use the following commands:
 ```python
 from transformers import AutoTokenizer, AutoModel
+from attacut import tokenize
+import torch
 tokenizer = AutoTokenizer.from_pretrained("new5558/HoogBERTa")
 model = AutoModel.from_pretrained("new5558/HoogBERTa")
 To extract token features, based on the RoBERTa architecture, use the following commands
 ```python
+model.eval()
+sentence = "วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"
+all_sent = []
+sentences = sentence.split(" ")
+for sent in sentences:
+    all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
+sentence = " _ ".join(all_sent)
+tokenized_text = tokenizer(sentence, return_tensors = 'pt')
+token_ids = tokenized_text['input_ids']
 with torch.no_grad():
+    features = model(**tokenized_text, output_hidden_states = True).hidden_states[-1]
 ```
 For batch processing,
 ```python
+model.eval()
+sentenceL = ["วันที่ 12 มีนาคมนี้","ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"]
+inputList = []
+for sentX in sentenceL:
+    sentences = sentX.split(" ")
+    all_sent = []
+    for sent in sentences:
+        all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
+    sentence = " _ ".join(all_sent)
+    inputList.append(sentence)
+tokenized_text = tokenizer(inputList, padding = True, return_tensors = 'pt')
+token_ids = tokenized_text['input_ids']
 with torch.no_grad():
+    features = model(**tokenized_text, output_hidden_states = True).hidden_states[-1]
 ```
 To use HoogBERTa as an embedding layer, use