MossyHead committed on
Commit
377ca3f
·
1 Parent(s): d3a3d7c
models/__pycache__/models_predict_toxicity.cpython-312.pyc ADDED
Binary file (2.03 kB). View file
 
models/__pycache__/p_bert.cpython-312.pyc ADDED
Binary file (2.6 kB). View file
 
models/__pycache__/p_logreg.cpython-312.pyc ADDED
Binary file (929 Bytes). View file
 
models/__pycache__/p_lstm.cpython-312.pyc ADDED
Binary file (7.56 kB). View file
 
models/logreg_bert.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6dc8a96c93ed97b248f73955cfe28998ab5bc360d2635dcc7129aa92425361
3
+ size 8225
models/logreg_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e36ba8ccd4fd99dd6d91d6e22872fb714b7c40e152ad0ea2ab02e240637400f
3
+ size 4391461
models/logreg_vec.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fc763c85441e38ede135901e446e05332a807f8bc5264d15d18646746f5c19d
3
+ size 7548801
models/model_bert.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d74ff4026ce64a4c33dda7730aa03c771b097cc1f0ea3d79d69935482559209
3
+ size 13420
models/model_lstm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ca41a271e53df95eed8996bf8ed9ebe3be4df84726d9ce55319b7b7159de630
3
+ size 14679450
models/models_predict_toxicity.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the rubert-tiny-toxicity model and tokenizer from the local "models" dir.
model_save_path = "models"

tokenizer_toxicity = AutoTokenizer.from_pretrained(model_save_path)
model_toxicity = AutoModelForSequenceClassification.from_pretrained(model_save_path, use_safetensors=True)

# Device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_toxicity.to(device)
model_toxicity.eval()  # inference only — disable dropout/batch-norm updates

# Toxicity prediction helper.
17
def predict_toxicity(phrase):
    """Classify *phrase* as toxic or non-toxic.

    Returns a ``(label, probability)`` tuple, where *label* is the Russian
    class name and *probability* is the softmax score of the chosen class.
    """
    encoded = tokenizer_toxicity(phrase, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move every input tensor onto the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model_toxicity(**encoded).logits

    probs = torch.nn.functional.softmax(logits, dim=-1)
    best = torch.argmax(probs, dim=-1).item()

    label_map = {0: "нетоксичный", 1: "токсичный"}
    return label_map[best], probs[0][best].item()
models/p_bert.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
import streamlit as st
import pickle
# Fix: `import streamlit as st` appeared twice (original lines 5 and 7);
# the duplicate is removed — the single import is identical in effect.
9
@st.cache_resource
def get_model():
    """Download rubert-tiny2 and its tokenizer (cached once per Streamlit app)."""
    tok = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    net = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    # Keep the (model, tokenizer) return order expected by callers.
    return net, tok
14
# Module-level encoder + tokenizer shared by predict_bert (Streamlit-cached).
model, tokenizer = get_model()
15
# Lazy cache for the unpickled logistic-regression head (loaded at most once).
_logreg_cache = []


def _load_logreg():
    """Load models/logreg_bert.pkl on first use and memoize it.

    NOTE(review): pickle.load on a repository-local artifact — safe only as
    long as the file is trusted.
    """
    if not _logreg_cache:
        with open('models/logreg_bert.pkl', 'rb') as f:
            _logreg_cache.append(pickle.load(f))
    return _logreg_cache[0]


def predict_bert(input_text):
    """Classify *input_text* via rubert-tiny2 [CLS] features + logistic regression.

    Returns the single predicted label from the pickled scikit-learn model.
    """
    MAX_LEN = 300

    # Tokenize, then right-pad with 0 up to MAX_LEN; the mask marks real tokens.
    token_ids = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=MAX_LEN)
    padded = np.array(token_ids + [0] * (MAX_LEN - len(token_ids)))
    attention_mask = np.where(padded != 0, 1, 0)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    with torch.no_grad():
        input_tensor = torch.tensor(padded).unsqueeze(0).to(device)
        mask_tensor = torch.tensor(attention_mask).unsqueeze(0).to(device)
        last_hidden_states = model(input_tensor, attention_mask=mask_tensor)[0]

    # First-token ([CLS]) hidden state as the sentence feature vector.
    features = last_hidden_states[:, 0, :].cpu().numpy()

    # Fix: the original reopened and unpickled the classifier on EVERY call;
    # the memoized loader reads the file once per process.
    return _load_logreg().predict(features)[0]
models/p_logreg.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from joblib import load
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle


# Fix: the original used Windows-only backslash paths ('models\logreg_vec.pkl',
# 'models\logreg_model.joblib'), which do not resolve on Linux/macOS; the
# sibling modules (p_bert.py) already use POSIX-style forward slashes.
with open('models/logreg_vec.pkl', 'rb') as f:
    vectorizer = pickle.load(f)  # fitted text vectorizer


# Load the fitted scikit-learn classifier.
classifier = load('models/logreg_model.joblib')
12
+
13
def predict_tfidf(text):
    """Vectorize *text* with the module-level vectorizer and classify it.

    Returns the raw array from ``classifier.predict`` (one element).
    """
    features = vectorizer.transform([text])
    return classifier.predict(features)
models/p_lstm.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hyper-parameters — must match the checkpoint saved in models/model_lstm.pt.
ATTENTION_SIZE=10  # timesteps kept by RomanAttention
HIDDEN_SIZE=300    # LSTM hidden size
INPUT_SIZE=312     # presumably the rubert-tiny2 embedding dim — TODO confirm
from math import e  # NOTE(review): appears unused in this file — confirm before removing
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import streamlit as st
# Single global device shared by all definitions below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
class RomanAttention(nn.Module):
    """Custom attention: keeps ATTENTION_SIZE timesteps selected by a learned
    scalar score and compresses the remaining timesteps into one score-weighted
    mean vector.

    NOTE(review): the ``hidden_size`` argument is accepted but ignored — layer
    sizes come from the module-level HIDDEN_SIZE constant. Confirm intentional.
    """
    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        super().__init__()
        # Scores each timestep's hidden vector with a single scalar.
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, hidden, final_hidden):
        # final_hidden arrives with a leading (num_layers) dim; reshape it to
        # (batch, 1, hidden) and append it to the sequence as an extra step.
        final_hidden = final_hidden.squeeze(0).unsqueeze(1)

        cat = torch.concat((hidden, final_hidden), dim=1)
        clf = self.clf(cat)  # (batch, seq+1, 1): one score per timestep
        # Ascending argsort: the first ATTENTION_SIZE indices are the
        # LOWEST-scored timesteps. NOTE(review): confirm ascending is intended.
        vals = torch.argsort(clf, descending=False, dim=1)
        index=vals[:,:ATTENTION_SIZE].squeeze(2)   # timesteps kept verbatim
        index1=vals[:,ATTENTION_SIZE:].squeeze(2)  # timesteps to be summarized
        # Batched gather of the kept vectors.
        selected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index]
        # Remaining vectors are weighted by their squared score before averaging.
        select_clf = clf[torch.arange(index.size(0)).unsqueeze(1), index1]
        unselected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index1]*select_clf*select_clf
        mean_unselected = torch.mean(unselected_values, dim=1)
        # Result: (batch, ATTENTION_SIZE + 1, hidden).
        return torch.cat((selected_values, mean_unselected.unsqueeze(1)), dim=1)
32
+
33
+
34
+ import pytorch_lightning as lg
35
+
36
@st.cache_resource
def load_model():
    """Fetch rubert-tiny2 once per Streamlit session; return its frozen
    embedding layer (moved to *device*) and the matching tokenizer."""
    tok = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    embedding_layer = AutoModel.from_pretrained("cointegrated/rubert-tiny2").embeddings
    # Freeze the embeddings: they are used as a fixed feature extractor.
    for p in embedding_layer.parameters():
        p.requires_grad = False
    return embedding_layer.to(device), tok
45
+
46
# Frozen embedding layer + tokenizer, materialized once (Streamlit-cached).
emb, tokenizer = load_model()
47
+
48
def tokenize(text):
    """Tokenize *text* with rubert-tiny2 (padded/truncated to 300 ids) and
    guarantee at least 30 ids.

    # presumably 30 is the minimum sequence length the LSTM expects — confirm
    """
    ids = tokenizer(text, padding=True, truncation=True,pad_to_multiple_of=300,max_length=300)['input_ids']
    # Right-pad with 0 up to 30 (extends by nothing when already long enough).
    ids.extend([0] * max(0, 30 - len(ids)))
    return ids
53
+
54
+
55
class MyModel(lg.LightningModule):
    """3-class text classifier: frozen rubert-tiny2 embeddings -> LSTM ->
    RomanAttention -> MLP head, trained with cross-entropy."""

    def __init__(self):
        super().__init__()

        self.lstm = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
        self.attn = RomanAttention(HIDDEN_SIZE)
        # Head input: ATTENTION_SIZE kept timesteps + 1 summary vector, flattened.
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE*(ATTENTION_SIZE+1), 100),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(100, 3)
        )

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        # NOTE(review): monitors 'val_acc', but validation_step logs
        # 'val_loss'/'val_accuracy' — the callback would never find its metric.
        # It also only takes effect if passed to the Trainer. Confirm both.
        self.early_stopping = lg.callbacks.EarlyStopping(
            monitor='val_acc',
            min_delta=0.01,
            patience=2,
            verbose=True,
            mode='max'
        )
        self.verbose=False

    def forward(self, x):
        # Inference convenience: accept a raw string, tokenize as a batch of 1.
        if type(x) == str:
            x = torch.tensor([tokenize(x)])
        x = x.to(device)

        embeddings = emb(x).to(device)  # frozen rubert-tiny2 embeddings
        output, (h_n, c_n) = self.lstm(embeddings)
        # Attention over LSTM outputs, conditioned on the final cell state.
        attention = self.attn(output, c_n)
        out =attention #torch.cat((output, attention), dim=1)
        out = nn.Flatten()(out)
        out_clf = self.clf(out)
        return out_clf


    def training_step(self, batch, batch_idx):
        # One training step: cross-entropy loss + accuracy, logged per epoch.
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)

        accuracy = (torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        self.log('train_accuracy', accuracy , on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        # Same metrics on the validation split (no optimizer step).
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        accuracy = ( torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('val_loss', loss , on_epoch=True, prog_bar=True)
        self.log('val_accuracy', accuracy , on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        # Lightning hook: reuse the Adam optimizer constructed in __init__.
        return self.optimizer