acho0057 committed on
Commit
a4c9d33
·
1 Parent(s): c555a74
.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ *.m filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: english
3
+ widget:
4
+ - text: "Covid cases are increasing fast!"
5
+ ---
6
+
7
+
8
+ # Twitter-roBERTa-base for Sentiment Analysis - UPDATED (2021)
9
+
10
+ This is a roBERTa-base model trained on ~124M tweets from January 2018 to December 2021 (see [here](https://huggingface.co/cardiffnlp/twitter-roberta-base-2021-124m)), and finetuned for sentiment analysis with the TweetEval benchmark.
11
+ The original roBERTa-base model can be found [here](https://huggingface.co/cardiffnlp/twitter-roberta-base-2021-124m) and the original reference paper is [TweetEval](https://github.com/cardiffnlp/tweeteval). This model is suitable for English.
12
+
13
+ - Reference Paper: [TimeLMs paper](https://arxiv.org/abs/2202.03829).
14
+ - Git Repo: [TimeLMs official repository](https://github.com/cardiffnlp/timelms).
15
+
16
+ <b>Labels</b>:
17
+ 0 -> Negative;
18
+ 1 -> Neutral;
19
+ 2 -> Positive
20
+
21
+ ## Example Pipeline
22
+ ```python
23
+ from transformers import pipeline
24
+ sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
25
+ sentiment_task("Covid cases are increasing fast!")
26
+ ```
27
+ ```
28
+ [{'label': 'Negative', 'score': 0.7236}]
29
+ ```
30
+
31
+ ## Full classification example
32
+
33
+ ```python
34
+ from transformers import AutoModelForSequenceClassification
35
+ from transformers import TFAutoModelForSequenceClassification
36
+ from transformers import AutoTokenizer, AutoConfig
37
+ import numpy as np
38
+ from scipy.special import softmax
39
+ # Preprocess text (username and link placeholders)
40
+ def preprocess(text):
41
+ new_text = []
42
+ for t in text.split(" "):
43
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
44
+ t = 'http' if t.startswith('http') else t
45
+ new_text.append(t)
46
+ return " ".join(new_text)
47
+ MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
48
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
49
+ config = AutoConfig.from_pretrained(MODEL)
50
+ # PT
51
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
52
+ #model.save_pretrained(MODEL)
53
+ text = "Covid cases are increasing fast!"
54
+ text = preprocess(text)
55
+ encoded_input = tokenizer(text, return_tensors='pt')
56
+ output = model(**encoded_input)
57
+ scores = output[0][0].detach().numpy()
58
+ scores = softmax(scores)
59
+ # # TF
60
+ # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
61
+ # model.save_pretrained(MODEL)
62
+ # text = "Covid cases are increasing fast!"
63
+ # encoded_input = tokenizer(text, return_tensors='tf')
64
+ # output = model(encoded_input)
65
+ # scores = output[0][0].numpy()
66
+ # scores = softmax(scores)
67
+ # Print labels and scores
68
+ ranking = np.argsort(scores)
69
+ ranking = ranking[::-1]
70
+ for i in range(scores.shape[0]):
71
+ l = config.id2label[ranking[i]]
72
+ s = scores[ranking[i]]
73
+ print(f"{i+1}) {l} {np.round(float(s), 4)}")
74
+ ```
75
+
76
+ Output:
77
+
78
+ ```
79
+ 1) Negative 0.7236
80
+ 2) Neutral 0.2287
81
+ 3) Positive 0.0477
82
+ ```
code/inference.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch


def predict_fn(data, model_and_tokenizer):
    """Run inference on text longer than the 512-token model limit.

    Tokenizes the input without special tokens, splits the token stream into
    510-token chunks, wraps each chunk in BOS/EOS special tokens, pads every
    chunk to length 512, and feeds the resulting batch of chunks through the
    model in a single forward pass.

    Parameters
    ----------
    data : dict or str
        Either a dict with an ``"inputs"`` key holding the text, or the raw
        text itself (SageMaker-style inference payload).
    model_and_tokenizer : tuple
        ``(model, tokenizer)`` pair as produced by ``model_fn``.

    Returns
    -------
    The model output for the stacked chunk batch (one row per chunk).
    """
    # destruct model and tokenizer
    model, tokenizer = model_and_tokenizer

    # Accept both {"inputs": text} payloads and a bare string.
    # (The original called data.pop() unconditionally, which crashes on str.)
    sentences = data.pop("inputs", data) if isinstance(data, dict) else data

    # This checkpoint is RoBERTa (see config.json: bos=0, eos=2, pad=1).
    # The original code injected BERT ids 101/[CLS], 102/[SEP] and padded
    # with 0, which are wrong tokens in the RoBERTa vocabulary. Prefer the
    # tokenizer's own ids; fall back to the RoBERTa defaults.
    bos_id = getattr(tokenizer, "cls_token_id", None)
    if bos_id is None:
        bos_id = 0
    eos_id = getattr(tokenizer, "sep_token_id", None)
    if eos_id is None:
        eos_id = 2
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 1

    # Tokenize without special tokens so we can chunk the raw stream and
    # add BOS/EOS per chunk ourselves.
    encoded_input = tokenizer(sentences, add_special_tokens=False, return_tensors="pt")
    input_id_chunks = list(encoded_input["input_ids"][0].split(510))
    mask_chunks = list(encoded_input["attention_mask"][0].split(510))

    for i in range(len(input_id_chunks)):
        # Wrap the chunk: [BOS] tokens... [EOS]. Build as long tensors
        # directly instead of float torch.Tensor(...) cast later.
        input_id_chunks[i] = torch.cat([
            torch.tensor([bos_id], dtype=torch.long),
            input_id_chunks[i].long(),
            torch.tensor([eos_id], dtype=torch.long),
        ])
        mask_chunks[i] = torch.cat([
            torch.ones(1, dtype=torch.long),
            mask_chunks[i].long(),
            torch.ones(1, dtype=torch.long),
        ])
        # Right-pad every chunk to the fixed 512-token window.
        pad_len = 512 - input_id_chunks[i].shape[0]
        if pad_len > 0:
            input_id_chunks[i] = torch.cat([
                input_id_chunks[i],
                torch.full((pad_len,), pad_id, dtype=torch.long),
            ])
            mask_chunks[i] = torch.cat([
                mask_chunks[i],
                torch.zeros(pad_len, dtype=torch.long),
            ])

    # One batch row per chunk.
    input_ids = torch.stack(input_id_chunks)
    attention_masks = torch.stack(mask_chunks)

    input_dict = {
        "input_ids": input_ids.long(),
        "attention_mask": attention_masks.int(),
    }
    output = model(**input_dict)
    return output
code/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ torch==1.12
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/sentiment/sentiment_latest_2021/",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "Negative",
16
+ "1": "Neutral",
17
+ "2": "Positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "Negative": 0,
23
+ "Neutral": 1,
24
+ "Positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.13.0.dev0",
35
+ "type_vocab_size": 1,
36
+ "vocab_size": 50265
37
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d24a3e32a88ed1c4e5b789fc6644e2e767500554e954b27dccf52a8e762cbae
3
+ size 501045531
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:682358ffb3869b08a144d5e59325534335729720fe64d5f2b3a543f8e5d14a9e
3
+ size 498845224
vocab.json ADDED
The diff for this file is too large to render. See raw diff