Enric Perpinyà Pitarch committed on
Commit
ff98283
·
1 Parent(s): 8b0d124

Predictor upvotes :D

Browse files
Files changed (2) hide show
  1. app.py +38 -63
  2. nn_factory.py +140 -0
app.py CHANGED
@@ -1,78 +1,53 @@
1
  import gradio as gr
2
- import hopsworks as hops
3
- import pandas as pd
4
- import joblib
5
  import torch
6
- import torch.nn.functional as F
7
- import re
8
-
9
-
10
- # hf_hub_model_name = "princeton-nlp/sup-simcse-bert-base-uncased"
11
- # hf_hub_model = None
12
-
13
- # def load_encoding_model():
14
- # global hf_hub_model
15
- # if hf_hub_model is None:
16
- # from transformers import AutoModel, AutoTokenizer
17
- # hf_hub_model = {
18
- # "tokenizer": AutoTokenizer.from_pretrained(hf_hub_model_name),
19
- # "model": AutoModel.from_pretrained(hf_hub_model_name)
20
- # }
21
- # return hf_hub_model
22
-
23
- # @torch.no_grad()
24
- # def to_embedding(data):
25
- # hf_hub_model = load_encoding_model()
26
- # inputs = hf_hub_model['tokenizer'](data, padding=True, truncation=True, return_tensors="pt")
27
- # embedding = hf_hub_model['model'](**inputs, output_hidden_states=True, return_dict=True).pooler_output
28
- # return embedding
29
-
30
- # def extract_words_from_link(link):
31
- # # Match alphanumeric sequences
32
- # url_str = ""
33
- # words = re.findall(r'\b\w+\b', link)
34
- # remove_list = ['https', 'http', 'www']
35
- # final_words = [w for w in words if not(w in remove_list)]
36
- # for w in final_words:
37
- # url_str += w + " "
38
- # return url_str
39
-
40
- # project = hops.login(project="id2223_enric")
41
- # fs = project.get_feature_store()
42
-
43
- # mr = project.get_model_registry()
44
- # model = mr.get_model("hackernews_model", version=2)
45
- # model_dir = model.download()
46
-
47
- # model = joblib.load(model_dir+'/model.pkl')
48
- # print("Model Loaded...")
49
-
50
- # def predict_score(title: str, url: str) -> int:
51
- # title_embedding = to_embedding([title]).unsqueeze(0)
52
- # url_embedding = to_embedding([url]).unsqueeze(0)
53
-
54
- # embedding = torch.cat([title_embedding, url_embedding], dim=1)
55
- # embedding = F.softmax(embedding, dim=-1)
56
-
57
- # model = torch.load('nbs/model.pth')
58
- # output = model(embedding)
59
- # score = output * 280
60
- # return int(score)
61
-
62
- def predict_score(title: str, url: str) -> int:
63
- return 1
64
 
65
  with gr.Blocks() as iface:
66
  with gr.Column():
67
  with gr.Column():
68
  title = gr.Textbox(label="Title")
69
- url = gr.Textbox(label="URL")
70
  with gr.Row():
71
  button = gr.Button("Submit", variant="primary")
72
  clear = gr.Button("Clear")
73
  with gr.Column():
74
  output = gr.Slider(label="Possible score", minimum=0, maximum=1000, step=1)
75
 
76
- button.click(predict_score, [title, url], output)
77
 
78
  iface.launch()
 
1
  import gradio as gr
 
 
 
2
  import torch
3
+ import torch.nn as nn
4
+ from transformers import BertTokenizer, BertModel
5
+ from nn_factory import nn_factory
6
+
7
+ from huggingface_hub import hf_hub_download
8
+
9
class BERT_classifier(nn.Module):
    """BERT encoder with a linear head that maps the pooled output to a score.

    The head output is passed through a sigmoid and scaled by 1000, so every
    returned value lies in the range [0, 1000].

    Args:
        bertmodel: Encoder module. It must expose ``config.hidden_dropout_prob``
            and ``config.hidden_size`` and, when called with the tokenizer
            output as keyword arguments, return an indexable result whose
            element 1 is the pooled representation.
        num_score: Number of output units of the linear head.
    """

    def __init__(self, bertmodel, num_score):
        super().__init__()
        self.bertmodel = bertmodel
        # NOTE(review): this dropout layer is constructed but forward() never
        # applies it. It is kept so the module's attributes stay unchanged;
        # confirm whether it was meant to run before the linear head.
        self.dropout = nn.Dropout(p=bertmodel.config.hidden_dropout_prob)
        self.linear = nn.Linear(bertmodel.config.hidden_size, num_score)

    def forward(self, wrapped_input):
        """Return one score in [0, 1000] per input sequence.

        Args:
            wrapped_input: Mapping of keyword arguments forwarded to the
                encoder (the tokenizer output).

        Returns:
            Tensor of shape ``(batch,)`` when ``num_score == 1``, otherwise
            ``(batch, num_score)``.
        """
        hidden = self.bertmodel(**wrapped_input)
        pooler_output = hidden[1]
        # Squeeze only the trailing unit dimension. A bare ``squeeze()`` would
        # also drop the batch dimension for a single-sample batch and produce
        # a 0-d tensor that no longer lines up with a ``(1,)`` label tensor.
        output_value = self.linear(pooler_output).squeeze(-1)
        score = torch.sigmoid(output_value) * 1000
        return score
22
+
23
# Tokenizer and encoder both come from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained("bert-base-uncased")

# Fetch the trained weights file from the Hugging Face Hub; the call returns
# the local path of the downloaded file.
model_dir = hf_hub_download(
    repo_id="ID2223/hackernews_upvotes_predictor_model",
    filename="model_1.pt",
    repo_type="model",
)

# Build the scoring model with a single output unit, load the downloaded
# weights onto the CPU and switch to inference mode.
# NOTE(review): torch.load unpickles the downloaded file - confirm the Hub
# repository is trusted.
model = BERT_classifier(bert, 1)
model.load_state_dict(
    torch.load(model_dir, map_location=torch.device('cpu'))
)
model.eval()

# Wrapper that tokenizes a sentence and runs it through the model on the CPU.
nn_obj = nn_factory(model, 'cpu', tokenizer)
36
+
37
def predict_score(title: str) -> int:
    """Return the integer score predicted for ``title``.

    Args:
        title: Text typed into the "Title" textbox.

    Returns:
        The value returned by ``nn_obj.predict`` converted to ``int``.
    """
    return int(nn_obj.predict(title))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Gradio UI: a title textbox with Submit/Clear buttons and a slider that
# displays the predicted score.
# NOTE(review): the nesting below is reconstructed - the source lost its
# indentation - confirm it matches the intended layout.
with gr.Blocks() as iface:
    with gr.Column():
        with gr.Column():
            title = gr.Textbox(label="Title")
            with gr.Row():
                button = gr.Button("Submit", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column():
            output = gr.Slider(label="Possible score", minimum=0, maximum=1000, step=1)

    button.click(predict_score, [title], output)
    # The Clear button previously had no handler and did nothing when
    # pressed; reset the textbox and the slider to their initial values.
    clear.click(lambda: ("", 0), None, [title, output])

iface.launch()
nn_factory.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from tqdm import tqdm
6
+ import time
7
+ import os
8
+ import matplotlib.pyplot as plt
9
+ plt.style.use('seaborn-v0_8-paper')
10
+
11
class nn_factory():
    """Train, validate and run inference with a tokenizer-fed regression model.

    The wrapped model is called with a dict of tensors and its output is
    compared against float labels with mean squared error.

    Args:
        model: ``torch.nn.Module`` to train / evaluate; moved to ``device``.
        device: Device the model and every batch are moved to.
        tokenizer: Callable that turns a sentence into the mapping of tensors
            the model consumes (used by ``predict_proba`` / ``predict``).
    """

    def __init__(self, model, device, tokenizer):
        self.model = model.to(device)
        self.device = device
        self.tokenizer = tokenizer

    @staticmethod
    def _count_exact(output, label):
        """Count predictions that equal their label after rounding.

        The model is a regressor, so taking an ``argmax`` over the batch (as
        the code used to do) yields a batch index, not a prediction. Rounding
        the predicted score and comparing it with the label keeps an
        "accuracy" figure that is at least meaningful.
        """
        return int(output.detach().round().eq(label).sum().item())

    def fit(self, epoch, optimizer, train_loader, val_loader, model_save_path):
        """Run ``epoch`` training epochs, checkpointing and plotting history.

        After each epoch the model is validated; whenever the validation loss
        is not worse than the best seen so far, the weights are saved under
        ``model_save_path``. At the end the final model/optimizer state is
        saved as ``last_model.pt`` and loss/accuracy curves are written as
        ``loss.jpg`` and ``acc.jpg``.

        Args:
            epoch: Number of epochs to run.
            optimizer: Optimizer stepping the model parameters.
            train_loader: Iterable of ``(data_dict, label)`` training batches
                exposing a ``dataset`` attribute.
            val_loader: Same structure, for validation.
            model_save_path: Directory receiving checkpoints and plots; it is
                created when missing.
        """
        os.makedirs(model_save_path, exist_ok=True)

        # float('inf') instead of np.Inf: that alias was removed in NumPy 2.0.
        best_val_loss = float('inf')
        train_loss_hist, train_acc_hist = [], []
        val_loss_hist, val_acc_hist = [], []

        for ep in range(1, epoch + 1):
            epoch_begin = time.time()
            cur_train_loss, cur_train_acc = self.train(train_loader, optimizer, ep)
            cur_val_loss, cur_val_acc = self.val(val_loader)

            print('elapse: %.2fs \n' % (time.time() - epoch_begin))

            if cur_val_loss <= best_val_loss:
                print('improve validataion loss, saving model...\n')
                torch.save(self.model.state_dict(),
                           os.path.join(model_save_path,
                                        f'best_model_ep_{ep}_loss_{cur_val_loss}_acc_{cur_val_acc}.pt'))
                best_val_loss = cur_val_loss

            train_loss_hist.append(cur_train_loss)
            train_acc_hist.append(cur_train_acc)
            val_loss_hist.append(cur_val_loss)
            val_acc_hist.append(cur_val_acc)

        # save final model
        state = {
            'epoch': epoch,
            'state_dict': self.model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(state, os.path.join(model_save_path, 'last_model.pt'))

        ### graph train hist ###
        fig = plt.figure()
        plt.plot(train_loss_hist)
        plt.plot(val_loss_hist)
        plt.legend(['train loss', 'val loss'], loc='best')
        plt.savefig(os.path.join(model_save_path, 'loss.jpg'))
        plt.close(fig)

        fig = plt.figure()
        plt.plot(train_acc_hist)
        plt.plot(val_acc_hist)
        plt.legend(['train acc', 'val acc'], loc='best')
        plt.savefig(os.path.join(model_save_path, 'acc.jpg'))
        plt.close(fig)

    def train(self, train_loader, optimizer, epoch):
        """Train for one epoch.

        Args:
            train_loader: Iterable of ``(data_dict, label)`` batches exposing
                a ``dataset`` attribute.
            optimizer: Optimizer stepping the model parameters.
            epoch: Epoch number, used only for logging.

        Returns:
            Tuple ``(mean MSE per sample, fraction of exact rounded matches)``.
        """
        print('[epoch %d]train on %d data......'%(epoch, len(train_loader.dataset)))
        criterion = nn.MSELoss()
        # The accumulator must start at 0; it used to start at infinity, which
        # made every reported loss infinite.
        loss_sum, correct, seen = 0.0, 0, 0

        self.model.train()
        for data, label in tqdm(train_loader):
            device_data = {k: v.to(self.device) for k, v in data.items()}
            # Flatten both sides so a single-sample batch cannot broadcast.
            device_label = label.to(self.device, dtype=torch.float32).reshape(-1)

            optimizer.zero_grad()
            output = self.model(device_data).reshape(-1)
            loss = criterion(output, device_label)
            loss.backward()
            optimizer.step()

            batch_size = device_label.numel()
            # MSELoss returns the batch mean; weight it by the batch size so
            # the final division gives a true per-sample average.
            loss_sum += loss.item() * batch_size
            correct += self._count_exact(output, device_label)
            seen += batch_size

        denom = max(seen, 1)  # guard against an empty loader
        train_loss = loss_sum / denom
        acc = correct / denom

        print('training set: average loss: %.4f, acc: %d/%d(%.3f%%)' %(train_loss,
              correct, seen, 100 * acc))

        return train_loss, acc

    def val(self, val_loader):
        """Evaluate the model without gradient tracking.

        Args:
            val_loader: Iterable of ``(data_dict, label)`` batches exposing a
                ``dataset`` attribute.

        Returns:
            Tuple ``(mean MSE per sample, fraction of exact rounded matches)``.
        """
        print('validation on %d data......'%len(val_loader.dataset))
        criterion = nn.MSELoss()
        loss_sum, correct, seen = 0.0, 0, 0

        self.model.eval()
        with torch.no_grad():
            for data, label in val_loader:
                device_data = {k: v.to(self.device) for k, v in data.items()}
                device_label = label.to(self.device, dtype=torch.float32).reshape(-1)

                output = self.model(device_data).reshape(-1)

                batch_size = device_label.numel()
                loss_sum += criterion(output, device_label).item() * batch_size
                correct += self._count_exact(output, device_label)
                seen += batch_size

        denom = max(seen, 1)  # guard against an empty loader
        val_loss = loss_sum / denom
        acc = correct / denom

        print('Val set:Average loss:%.4f, acc:%d/%d(%.3f%%)' %(val_loss,
              correct, seen, 100. * acc))

        return val_loss, acc

    def predict_proba(self, sentence):
        """Run the model on ``sentence`` and return its raw output.

        The name is kept for backward compatibility. The value returned is
        the model output itself: the previous ``torch.exp`` treated a
        regression score as a log-probability and overflowed to infinity.

        Args:
            sentence: Text (or batch of texts) handed to the tokenizer.

        Returns:
            1-D ``numpy`` array with one model output per input sequence.
        """
        wrapped_input = self.tokenizer(sentence, max_length=30, add_special_tokens=True,
                                       truncation=True, padding='max_length', return_tensors="pt")
        # Inputs have to live on the same device as the model.
        device_input = {k: v.to(self.device) for k, v in wrapped_input.items()}

        # Infer in eval mode, then restore whatever mode the model was in.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                raw = self.model(device_input)
        finally:
            self.model.train(was_training)

        return np.atleast_1d(raw.detach().cpu().numpy())

    def predict(self, sentence):
        """Return the predicted score for ``sentence``.

        Previously this took ``argmax`` over a single value, so the result
        was always 0 regardless of the input.

        Args:
            sentence: Text (or batch of texts) to score.

        Returns:
            A Python ``float`` when the model yields a single value, otherwise
            the ``numpy`` array of scores.
        """
        scores = self.predict_proba(sentence)
        if scores.size == 1:
            return float(scores.reshape(-1)[0])
        return scores