melll-uff
/

bertweetbr

Model card Files Files and versions

Fernando Carneiro committed on Sep 11, 2022

Commit

5af2a4f

·

1 Parent(s): 24940da

README

Files changed (1) hide show

README.md +15 -2

README.md CHANGED Viewed

@@ -20,9 +20,8 @@ tokenizer = AutoTokenizer.from_pretrained('melll-uff/bertweetbr', normalization=
 # INPUT TWEETS ALREADY NORMALIZED!
 inputs = [
-    "Tem vídeo novo no canal do @USER :rosto_sorridente_com_olhos_de_coração: Passem por lá e confiram : HTTPURL",
-    "Que jogo ontem @USER :mãos_juntas:",
     "Procuro um amor , que seja bom pra mim ... vou procurar , eu vou até o fim :nota_musical:",
     "Demojizer para Python é :polegar_para_cima: e está disponível em HTTPURL"]
 encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True)
@@ -32,6 +31,10 @@ with torch.no_grad():
 # CLS Token of last hidden states. Shape: (number of input sentences, hidden size of the model)
 last_hidden_states[0][:,0,:]
 ```
  ### Normalize raw input Tweets
@@ -58,7 +61,17 @@ tokenizer.demojizer = lambda x: demojize(x, language='pt')
  'Que jogo ontem @USER :mãos_juntas:',
  'Demojizer para Python é :polegar_para_cima: e está disponível em HTTPURL']
 ```
  ### Mask Filling with Pipeline

 # INPUT TWEETS ALREADY NORMALIZED!
 inputs = [
     "Procuro um amor , que seja bom pra mim ... vou procurar , eu vou até o fim :nota_musical:",
+    "Que jogo ontem @USER :mãos_juntas:",
     "Demojizer para Python é :polegar_para_cima: e está disponível em HTTPURL"]
 encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True)
 # CLS Token of last hidden states. Shape: (number of input sentences, hidden size of the model)
 last_hidden_states[0][:,0,:]
+tensor([[-0.1430, -0.1325,  0.1595,  ..., -0.0802, -0.0153, -0.1358],
+        [-0.0108,  0.1415,  0.0695,  ...,  0.1420,  0.1153, -0.0176],
+        [-0.1854,  0.1866,  0.3163,  ..., -0.2117,  0.2123, -0.1907]])
 ```
  ### Normalize raw input Tweets
  'Que jogo ontem @USER :mãos_juntas:',
  'Demojizer para Python é :polegar_para_cima: e está disponível em HTTPURL']
+encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True)
+with torch.no_grad():
+    last_hidden_states = model(**encoded_inputs)
+# CLS Token of last hidden states. Shape: (number of input sentences, hidden size of the model)
+last_hidden_states[0][:,0,:]
+tensor([[-0.1430, -0.1325,  0.1595,  ..., -0.0802, -0.0153, -0.1358],
+        [-0.0108,  0.1415,  0.0695,  ...,  0.1420,  0.1153, -0.0176],
+        [-0.1854,  0.1866,  0.3163,  ..., -0.2117,  0.2123, -0.1907]])
 ```
  ### Mask Filling with Pipeline