In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
import torch
from torch import nn
import matplotlib.pyplot as plt

# импортируем трансформеры
import transformers
import warnings
warnings.filterwarnings('ignore')
import re

#Messages

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/aux/labeled(1).csv')

In [3]:
df

Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке - собачья смерть\n,1.0
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0
...,...,...
14407,Вонючий совковый скот прибежал и ноет. А вот и...,1.0
14408,А кого любить? Гоблина тупорылого что-ли? Или ...,1.0
14409,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0.0
14410,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1.0


In [15]:
import nltk

In [16]:
# !pip install nltk

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
stop_words = set(stopwords.words("russian"))

In [20]:
def clean_text(text):
    # Удаление всего, что не является буквами или знаками препинания
    clean_pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    text = clean_pattern.sub('', text)
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    text = url_pattern.sub(r'', text)
    text = re.sub("\s+", " ", text)
    splitted_text = [word for word in text.split() if word not in stop_words]
    text = " ".join(splitted_text)
    return text

In [21]:
df['comment'] = df['comment'].apply(clean_text)

In [22]:
df

Unnamed: 0,comment,toxic
0,"Верблюдовто что? Дебилы, бл...",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке собачья смерть,1.0
3,"Страницу обнови, дебил. Это оскорбление, доказ...",1.0
4,"убедил 6страничный пдф том, Скрипалей отравила...",1.0
...,...,...
14407,Вонючий совковый скот прибежал ноет. А сторонн...,1.0
14408,А кого любить? Гоблина тупорылого чтоли? Или к...,1.0
14409,"Посмотрел Утомленных солнцем 2. И оказалось, э...",0.0
14410,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1.0


In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

In [46]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['toxic'], test_size=0.2, random_state=42)

In [27]:
lengths = [len(review) for review in df['comment']]

# Шаг 3: Вычислить 75-й квантиль длины отзывов
quantile_75 = np.percentile(lengths, 75)
quantile_75

160.0

In [28]:
MAX_LEN = 100

In [29]:
tokenized_train = X_train.apply((lambda x: tokenizer.encode(x,
                                add_special_tokens=True,
                                truncation=True,
                                padding='max_length',
                                max_length=MAX_LEN)))

In [30]:
tokenized_valid = X_test.apply((lambda x: tokenizer.encode(x,
                                add_special_tokens=True,
                                truncation=True,
                                padding='max_length',
                                max_length=MAX_LEN)))

In [31]:
attention_mask_train = np.where(np.array(list(tokenized_train.values)) != 0, 1, 0)

In [32]:
attention_mask_valid = np.where(np.array(list(tokenized_valid.values)) != 0, 1, 0)

In [33]:
from torch.utils.data import Dataset

In [34]:
class BertInputs(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs, attention_masks, targets):
        self.tokenized_inputs = tokenized_inputs
        self.attention_masks = attention_masks
        self.targets = targets

    def __len__(self):
        return self.tokenized_inputs.shape[0]

    def __getitem__(self, idx):
        ids = self.tokenized_inputs[idx]
        ams = self.attention_masks[idx]
        target = self.targets[idx]

        return ids, ams, target


In [35]:
train_tokens = np.array(list(tokenized_train.values))

In [36]:
valid_tokens = np.array(list(tokenized_valid.values))

In [41]:
target_train = y_train.to_numpy()
target_valid = y_test.to_numpy()

In [39]:
train_dataset = BertInputs(torch.from_numpy(train_tokens), attention_mask_train, torch.from_numpy(target_train))

In [42]:
valid_dataset = BertInputs(torch.from_numpy(valid_tokens), attention_mask_valid, torch.from_numpy(target_valid))

In [43]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=False)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [107]:
class BERTClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-tiny-toxicity')
        self.bert.classifier = nn.Linear(312, 312)
        for param in self.bert.parameters():
            param.requires_grad = False
        self.linear = nn.Sequential(
            nn.Linear(312, 128),
            nn.Sigmoid(),
            nn.Dropout(),
            nn.Linear(128, 1)
        )

    def forward(self, x, attention_mask=None):
        bert_out = self.bert(x, attention_mask=attention_mask).logits
        out = self.linear(bert_out).squeeze(1)
        return out

In [108]:
# !pip install torchmetrics

In [109]:
from torchmetrics import Accuracy
from torchmetrics.classification import BinaryF1Score

In [110]:
model = BERTClassifier()
device  = 'cuda'
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
metric = BinaryF1Score().to(device)
model.to(device);

In [111]:
def train_attention_lstm(epochs, model, train_loader, valid_loader, optimizer,
                         criterion, metric, lstm_conf=None):
    epoch_train_losses = []
    epoch_valid_losses = []
    epoch_train_metric = []
    epoch_valid_metric = []
    device = 'cuda'
    for epoch in range(epochs):
        batch_losses = []
        batch_metric = []
        model.train()
        model.to(device)
        for inputs, attention_masks, labels in train_loader:
            inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)

            output = model(inputs, attention_mask=attention_masks)
            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())
            batch_metric.append(metric(output, labels).item())

        epoch_train_losses.append(np.mean(batch_losses))
        epoch_train_metric.append(np.mean(batch_metric))

        batch_losses = []
        batch_metric = []
        model.eval()
        for inputs, attention_masks, labels in valid_loader:
            inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)
            with torch.no_grad():
                output = model(inputs, attention_mask=attention_masks)
            loss = criterion(output, labels)
            batch_losses.append(loss.item())
            batch_metric.append(metric(output, labels).item())
        epoch_valid_losses.append(np.mean(batch_losses))
        epoch_valid_metric.append(np.mean(batch_metric))

        print(f'Epoch {epoch+1}')
        print(f'Train loss: {epoch_train_losses[-1]:.4f}, Val loss {epoch_valid_losses[-1]:.4f}')
        print(f'Train accuracy: {epoch_train_metric[-1]:.2f}, Val accuracy: {epoch_valid_metric[-1]:.2f}')
        print(25*'==')

    return (epoch_train_losses, epoch_valid_losses, epoch_train_metric, epoch_valid_metric)

In [112]:
train_attention_lstm(20, model, train_loader, valid_loader, optimizer, criterion, metric)

Epoch 1
Train loss: 0.5084, Val loss 0.4024
Train accuracy: 0.57, Val accuracy: 0.70
Epoch 2
Train loss: 0.4391, Val loss 0.3867
Train accuracy: 0.66, Val accuracy: 0.72
Epoch 3
Train loss: 0.4279, Val loss 0.3793
Train accuracy: 0.68, Val accuracy: 0.73
Epoch 4
Train loss: 0.4193, Val loss 0.3731
Train accuracy: 0.69, Val accuracy: 0.73
Epoch 5
Train loss: 0.4131, Val loss 0.3679
Train accuracy: 0.70, Val accuracy: 0.74
Epoch 6
Train loss: 0.4070, Val loss 0.3634
Train accuracy: 0.70, Val accuracy: 0.74
Epoch 7
Train loss: 0.4082, Val loss 0.3593
Train accuracy: 0.70, Val accuracy: 0.75
Epoch 8
Train loss: 0.4026, Val loss 0.3563
Train accuracy: 0.71, Val accuracy: 0.75
Epoch 9
Train loss: 0.4043, Val loss 0.3537
Train accuracy: 0.71, Val accuracy: 0.76
Epoch 10
Train loss: 0.4014, Val loss 0.3518
Train accuracy: 0.72, Val accuracy: 0.76
Epoch 11
Train loss: 0.3974, Val loss 0.3497
Train accuracy: 0.72, Val accuracy: 0.76
Epoch 12
Train loss: 0.3948, Val loss 0.3478
Train accuracy: 0.

([0.5083780060451313,
  0.4391106702167688,
  0.42793449899041985,
  0.4192526624897032,
  0.4130774913023195,
  0.40695059533327993,
  0.40816243859394574,
  0.4025705500237704,
  0.40428590983994134,
  0.4014262039793654,
  0.3973992633044861,
  0.39483769866059215,
  0.3951748218383066,
  0.39460813710124554,
  0.3929524585412899,
  0.3901995505507913,
  0.39005828239460105,
  0.3898510054791003,
  0.39051921931187866,
  0.3903515106272374],
 [0.402441298047971,
  0.3867063879904477,
  0.3793247446857884,
  0.3730928883862242,
  0.3679111393862797,
  0.3633536281389945,
  0.3593491589004045,
  0.3562914771151528,
  0.3536964474871052,
  0.35182306410159725,
  0.34968449011011793,
  0.3477907560270232,
  0.34674735903661463,
  0.3454434093392382,
  0.34439929916767315,
  0.34329559643294033,
  0.34237983056187166,
  0.3415207479647173,
  0.3408517934908176,
  0.3403323164568777],
 [0.5658733633829286,
  0.6621058487908662,
  0.6838039802464752,
  0.6886554599386173,
  0.6963648699492

In [113]:
# Сохранение весов модели
torch.save(model.state_dict(), 'model_weights.pth')
