Spaces: Build error

Commit: 'add'

Files changed:
- models/__pycache__/models_predict_toxicity.cpython-312.pyc +0 -0
- models/__pycache__/p_bert.cpython-312.pyc +0 -0
- models/__pycache__/p_logreg.cpython-312.pyc +0 -0
- models/__pycache__/p_lstm.cpython-312.pyc +0 -0
- models/logreg_bert.pkl +3 -0
- models/logreg_model.joblib +3 -0
- models/logreg_vec.pkl +3 -0
- models/model_bert.pth +3 -0
- models/model_lstm.pt +3 -0
- models/models_predict_toxicity.py +27 -0
- models/p_bert.py +39 -0
- models/p_logreg.py +16 -0
- models/p_lstm.py +113 -0
models/__pycache__/models_predict_toxicity.cpython-312.pyc
ADDED
Binary file (2.03 kB)

models/__pycache__/p_bert.cpython-312.pyc
ADDED
Binary file (2.6 kB)

models/__pycache__/p_logreg.cpython-312.pyc
ADDED
Binary file (929 Bytes)

models/__pycache__/p_lstm.cpython-312.pyc
ADDED
Binary file (7.56 kB)
models/logreg_bert.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a6dc8a96c93ed97b248f73955cfe28998ab5bc360d2635dcc7129aa92425361
size 8225
models/logreg_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e36ba8ccd4fd99dd6d91d6e22872fb714b7c40e152ad0ea2ab02e240637400f
size 4391461
models/logreg_vec.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7fc763c85441e38ede135901e446e05332a807f8bc5264d15d18646746f5c19d
size 7548801
models/model_bert.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d74ff4026ce64a4c33dda7730aa03c771b097cc1f0ea3d79d69935482559209
size 13420
models/model_lstm.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ca41a271e53df95eed8996bf8ed9ebe3be4df84726d9ce55319b7b7159de630
size 14679450
models/models_predict_toxicity.py
ADDED
@@ -0,0 +1,27 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the rubert-tiny-toxicity model and tokenizer from the local "models" directory
model_save_path = "models"

tokenizer_toxicity = AutoTokenizer.from_pretrained(model_save_path)
model_toxicity = AutoModelForSequenceClassification.from_pretrained(model_save_path, use_safetensors=True)

# Use the GPU if available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_toxicity.to(device)
model_toxicity.eval()

# Predict the toxicity of a phrase; returns the label and its probability
def predict_toxicity(phrase):
    inputs = tokenizer_toxicity(phrase, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model_toxicity(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    label_map = {0: "non-toxic", 1: "toxic"}
    prediction = label_map[predicted_label]
    return prediction, probabilities[0][predicted_label].item()
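For reference, a minimal way to call this module (a sketch, not part of the commit; the example phrase is illustrative, and the paths assume the process runs from the repo root):

from models.models_predict_toxicity import predict_toxicity

label, prob = predict_toxicity("пример фразы")  # Russian input, e.g. "example phrase"
print(label, round(prob, 3))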
models/p_bert.py
ADDED
@@ -0,0 +1,39 @@
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import streamlit as st

@st.cache_resource
def get_model():
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    return model, tokenizer

model, tokenizer = get_model()

def predict_bert(input_text):
    MAX_LEN = 300

    # Tokenize, pad with zeros up to MAX_LEN, and build the attention mask
    tokenized_input = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=MAX_LEN)
    padded_input = np.array(tokenized_input + [0] * (MAX_LEN - len(tokenized_input)))
    attention_mask = np.where(padded_input != 0, 1, 0)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    with torch.no_grad():
        input_tensor = torch.tensor(padded_input).unsqueeze(0).to(device)
        attention_mask_tensor = torch.tensor(attention_mask).unsqueeze(0).to(device)
        last_hidden_states = model(input_tensor, attention_mask=attention_mask_tensor)[0]

    # The [CLS] embedding serves as the feature vector for the logistic-regression head
    features = last_hidden_states[:, 0, :].cpu().numpy()

    with open('models/logreg_bert.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    prediction = loaded_model.predict(features)

    return prediction[0]
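A usage sketch for this file (the rubert-tiny2 weights download on first import; the example text is illustrative):

from models.p_bert import predict_bert

# Returns the class predicted by the logistic regression over the [CLS] embedding
print(predict_bert("пример отзыва"))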
models/p_logreg.py
ADDED
@@ -0,0 +1,16 @@
from joblib import load
import pickle

# Load the fitted vectorizer (forward-slash paths so they resolve on the Linux build host)
with open('models/logreg_vec.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Load the model
classifier = load('models/logreg_model.joblib')

def predict_tfidf(text):
    text_review_vectorized = vectorizer.transform([text])
    prediction = classifier.predict(text_review_vectorized)
    return prediction
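Usage sketch (same repo-root path assumption as above):

from models.p_logreg import predict_tfidf

# predict_tfidf returns an array with one label per input text
print(predict_tfidf("пример текста")[0])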
models/p_lstm.py
ADDED
@@ -0,0 +1,113 @@
import torch
import torch.nn as nn
import pytorch_lightning as lg
import streamlit as st
from transformers import AutoTokenizer, AutoModel

ATTENTION_SIZE = 10
HIDDEN_SIZE = 300
INPUT_SIZE = 312

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class RomanAttention(nn.Module):
    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        super().__init__()
        # Small MLP that scores each timestep's hidden state
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, hidden, final_hidden):
        # Append the final LSTM cell state as one extra "timestep"
        final_hidden = final_hidden.squeeze(0).unsqueeze(1)
        cat = torch.concat((hidden, final_hidden), dim=1)
        clf = self.clf(cat)
        # Keep the ATTENTION_SIZE lowest-scoring timesteps as-is; reweight the
        # rest by the square of their scores and reduce them to their mean
        vals = torch.argsort(clf, descending=False, dim=1)
        index = vals[:, :ATTENTION_SIZE].squeeze(2)
        index1 = vals[:, ATTENTION_SIZE:].squeeze(2)
        selected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index]
        select_clf = clf[torch.arange(index.size(0)).unsqueeze(1), index1]
        unselected_values = cat[torch.arange(index.size(0)).unsqueeze(1), index1] * select_clf * select_clf
        mean_unselected = torch.mean(unselected_values, dim=1)
        return torch.cat((selected_values, mean_unselected.unsqueeze(1)), dim=1)


@st.cache_resource
def load_model():
    # Only the frozen rubert-tiny2 embedding layer is used, not the full encoder
    m = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    emb = m.embeddings
    for param in emb.parameters():
        param.requires_grad = False
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    return emb.to(device), tokenizer

emb, tokenizer = load_model()

def tokenize(text):
    t = tokenizer(text, padding=True, truncation=True, pad_to_multiple_of=300, max_length=300)['input_ids']
    if len(t) < 30:
        t += [0] * (30 - len(t))
    return t


class MyModel(lg.LightningModule):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
        self.attn = RomanAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE * (ATTENTION_SIZE + 1), 100),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(100, 3)
        )

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        # Note: takes effect only if passed to a Trainer's callbacks list
        self.early_stopping = lg.callbacks.EarlyStopping(
            monitor='val_accuracy',
            min_delta=0.01,
            patience=2,
            verbose=True,
            mode='max'
        )
        self.verbose = False

    def forward(self, x):
        # Accept a raw string for single-phrase inference
        if isinstance(x, str):
            x = torch.tensor([tokenize(x)])
        x = x.to(device)

        embeddings = emb(x)
        output, (h_n, c_n) = self.lstm(embeddings)
        attention = self.attn(output, c_n)
        out = nn.Flatten()(attention)
        out_clf = self.clf(out)
        return out_clf

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        accuracy = (torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        self.log('train_accuracy', accuracy, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        accuracy = (torch.argmax(y_pred, dim=1) == y).float().mean()
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_accuracy', accuracy, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer