Spaces: Build error

Commit: 'add'

Files changed:
- models/__pycache__/models_predict_toxicity.cpython-312.pyc +0 -0
- models/__pycache__/p_bert.cpython-312.pyc +0 -0
- models/__pycache__/p_logreg.cpython-312.pyc +0 -0
- models/__pycache__/p_lstm.cpython-312.pyc +0 -0
- models/logreg_bert.pkl +3 -0
- models/logreg_model.joblib +3 -0
- models/logreg_vec.pkl +3 -0
- models/model_bert.pth +3 -0
- models/model_lstm.pt +3 -0
- models/models_predict_toxicity.py +27 -0
- models/p_bert.py +39 -0
- models/p_logreg.py +16 -0
- models/p_lstm.py +113 -0
models/__pycache__/models_predict_toxicity.cpython-312.pyc
ADDED
Binary file (2.03 kB)

models/__pycache__/p_bert.cpython-312.pyc
ADDED
Binary file (2.6 kB)

models/__pycache__/p_logreg.cpython-312.pyc
ADDED
Binary file (929 Bytes)

models/__pycache__/p_lstm.cpython-312.pyc
ADDED
Binary file (7.56 kB)
models/logreg_bert.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a6dc8a96c93ed97b248f73955cfe28998ab5bc360d2635dcc7129aa92425361
size 8225
models/logreg_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e36ba8ccd4fd99dd6d91d6e22872fb714b7c40e152ad0ea2ab02e240637400f
size 4391461
models/logreg_vec.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7fc763c85441e38ede135901e446e05332a807f8bc5264d15d18646746f5c19d
size 7548801
models/model_bert.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d74ff4026ce64a4c33dda7730aa03c771b097cc1f0ea3d79d69935482559209
size 13420
models/model_lstm.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ca41a271e53df95eed8996bf8ed9ebe3be4df84726d9ce55319b7b7159de630
size 14679450
models/models_predict_toxicity.py
ADDED
@@ -0,0 +1,27 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the rubert-tiny-toxicity model and tokenizer from the local "models" directory
model_save_path = "models"

tokenizer_toxicity = AutoTokenizer.from_pretrained(model_save_path)
model_toxicity = AutoModelForSequenceClassification.from_pretrained(model_save_path, use_safetensors=True)

# Use the GPU if available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_toxicity.to(device)
model_toxicity.eval()

# Predict the toxicity of a phrase; returns the label and its probability
def predict_toxicity(phrase):
    inputs = tokenizer_toxicity(phrase, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model_toxicity(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    label_map = {0: "non-toxic", 1: "toxic"}
    prediction = label_map[predicted_label]
    return prediction, probabilities[0][predicted_label].item()
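For reference, a minimal way to call this module (a sketch, not part of the commit; the example phrase is illustrative, and the paths assume the process runs from the repo root):

from models.models_predict_toxicity import predict_toxicity

label, prob = predict_toxicity("пример фразы")  # Russian input, e.g. "example phrase"
print(label, round(prob, 3))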
models/p_bert.py
ADDED
@@ -0,0 +1,39 @@
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import streamlit as st

@st.cache_resource
def get_model():
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    return model, tokenizer

model, tokenizer = get_model()

def predict_bert(input_text):
    MAX_LEN = 300

    # Tokenize, pad with zeros up to MAX_LEN, and build the attention mask
    tokenized_input = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=MAX_LEN)
    padded_input = np.array(tokenized_input + [0] * (MAX_LEN - len(tokenized_input)))
    attention_mask = np.where(padded_input != 0, 1, 0)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    with torch.no_grad():
        input_tensor = torch.tensor(padded_input).unsqueeze(0).to(device)
        attention_mask_tensor = torch.tensor(attention_mask).unsqueeze(0).to(device)
        last_hidden_states = model(input_tensor, attention_mask=attention_mask_tensor)[0]

    # The [CLS] embedding serves as the feature vector for the logistic-regression head
    features = last_hidden_states[:, 0, :].cpu().numpy()

    with open('models/logreg_bert.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    prediction = loaded_model.predict(features)

    return prediction[0]
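A usage sketch for this file (the rubert-tiny2 weights download on first import; the example text is illustrative):

from models.p_bert import predict_bert

# Returns the class predicted by the logistic regression over the [CLS] embedding
print(predict_bert("пример отзыва"))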
models/p_logreg.py
ADDED
@@ -0,0 +1,16 @@
from joblib import load
import pickle

# Load the fitted vectorizer (forward-slash paths so they resolve on the Linux build host)
with open('models/logreg_vec.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Load the model
classifier = load('models/logreg_model.joblib')

def predict_tfidf(text):
    text_review_vectorized = vectorizer.transform([text])
    prediction = classifier.predict(text_review_vectorized)
    return prediction
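Usage sketch (same repo-root path assumption as above):

from models.p_logreg import predict_tfidf

# predict_tfidf returns an array with one label per input text
print(predict_tfidf("пример текста")[0])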
models/p_lstm.py
ADDED
@@ -0,0 +1,113 @@
import torch
import torch.nn as nn
import pytorch_lightning as lg
import streamlit as st
from transformers import AutoTokenizer, AutoModel

ATTENTION_SIZE = 10
HIDDEN_SIZE = 300
INPUT_SIZE = 312

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class RomanAttention(nn.Module):
    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        super().__init__()
        # Small MLP that scores each timestep's hidden state
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, hidden, final_hidden):
        # Append the final LSTM cell state as one extra "timestep"
        final_hidden = final_hidden.squeeze(0).unsqueeze(1)
        cat = torch.concat((hidden, final_hidden), dim=1)
        clf = self.clf(cat)
        # Keep the ATTENTION_SIZE lowest-scoring timesteps as-is; reweight the
        # rest by the square of their scores and reduce them to their mean
        vals = torch.argsort(clf, descending=False, dim=1)
        index = vals[:, :ATTENTION_SIZE].squeeze(2)
        index1 = vals[:, ATTENTION_SIZE:].squeeze(2)
        selected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index]
        select_clf = clf[torch.arange(index.size(0)).unsqueeze(1), index1]
        unselected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index1] * select_clf * select_clf
        mean_unselected = torch.mean(unselected_values, dim=1)
        return torch.cat((selected_values, mean_unselected.unsqueeze(1)), dim=1)


@st.cache_resource
def load_model():
    # Only the frozen rubert-tiny2 embedding layer is used, not the full encoder
    m = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    emb = m.embeddings
    for param in emb.parameters():
        param.requires_grad = False
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    return emb.to(device), tokenizer

emb, tokenizer = load_model()

def tokenize(text):
    t = tokenizer(text, padding=True, truncation=True, pad_to_multiple_of=300, max_length=300)['input_ids']
    if len(t) < 30:
        t += [0] * (30 - len(t))
    return t


class MyModel(lg.LightningModule):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
        self.attn = RomanAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE * (ATTENTION_SIZE + 1), 100),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(100, 3)
        )

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        # Note: takes effect only if passed to a Trainer's callbacks list
        self.early_stopping = lg.callbacks.EarlyStopping(
            monitor='val_accuracy',
            min_delta=0.01,
            patience=2,
            verbose=True,
            mode='max'
        )
        self.verbose = False

    def forward(self, x):
        # Accept a raw string for single-phrase inference
        if isinstance(x, str):
            x = torch.tensor([tokenize(x)])
        x = x.to(device)

        embeddings = emb(x)
        output, (h_n, c_n) = self.lstm(embeddings)
        attention = self.attn(output, c_n)
        out = nn.Flatten()(attention)
        out_clf = self.clf(out)
        return out_clf

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        accuracy = (torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        self.log('train_accuracy', accuracy, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        accuracy = (torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_accuracy', accuracy, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer