"""evaluation_comp.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1qD1t_GF67fbwftmUYfuMDpwVFICPk5kJ
"""
|
|
!pip install gradio
!pip install transformers
|
|
import re

import gradio as gr
import nltk
import pandas as pd
import torch
from torch import nn
from sklearn.metrics import f1_score
from transformers import BertModel, BertTokenizer

nltk.download(['punkt', 'stopwords'])
|
|
def remove_short_strings(df: pd.DataFrame, string_column: str) -> pd.DataFrame:
    """Drop rows whose string_column value is a single character."""
    df[string_column] = df[string_column].astype(str)
    df['length'] = df[string_column].str.len()
    df = df.drop(df[df['length'] == 1].index)
    df = df.drop(columns=['length'])
    return df


def remove_one_character_words(row):
    """Remove one-character words from a row's text field."""
    words = row['text'].split()
    return ' '.join([word for word in words if len(word) > 1])


def ret_list_to_str(liste):
    """Join a list of tokens back into a single space-separated string."""
    return " ".join(i for i in liste)


def preprocess_tweet(tweet):
    # Lowercase, then cap any run of a repeated character at two occurrences.
    tweet = tweet.lower()
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    # Keep only Latin and Turkish letters plus whitespace.
    tweet = re.sub(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]', '', tweet)
    # Collapse repeated whitespace and trim the ends.
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
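# Hand-traced illustration of preprocess_tweet (the example input is made up):
#   preprocess_tweet("Çoooook kötüüü!!! 100%")
#   -> "çoooook kötüüü!!! 100%"   after lowercasing
#   -> "çook kötüü!! 100%"        character repeats capped at two
#   -> "çook kötüü "              digits and punctuation stripped
#   -> "çook kötüü"               whitespace squeezed and trimmed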
def cleaning_stopwords(text, stop_words):
    """Drop every word that appears in stop_words."""
    return " ".join([word for word in str(text).split() if word not in stop_words])


from nltk.corpus import stopwords

turkish_stopwords = stopwords.words('turkish')
turkish_stopwords.append("bir")
turkish_stopwords = set(turkish_stopwords)
|
|
|
|
from sklearn import preprocessing
from nltk.tokenize import word_tokenize
|
|
|
|
def prep_and_sw_and_tokenize(df):
    """Normalize the text column and strip Turkish stopwords, in place."""
    turkish_stopwords = stopwords.words('turkish')
    turkish_stopwords.append("bir")
    stop_words = set(turkish_stopwords)
    df["text"] = df["text"].apply(preprocess_tweet)
    df['text'] = df["text"].apply(lambda text: cleaning_stopwords(text, stop_words))
    return df
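# Minimal sketch of the pipeline on a toy frame (the sample sentence is made
# up; assumes "bu" is in NLTK's Turkish stopword list, with "bir" appended above):
#   demo = pd.DataFrame({"text": ["Bu bir ÇOOOOK kötü örnek!!!"]})
#   prep_and_sw_and_tokenize(demo)
#   demo["text"].iloc[0]  ->  "çook kötü örnek"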
|
|
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")


class BertClassifierConv1D(nn.Module):
    """BERT encoder followed by a Conv1d, a BiLSTM, and a linear classifier head."""

    def __init__(self, dropout=0.5, num_classes=5):
        super(BertClassifierConv1D, self).__init__()
        self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
        self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
        self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(128, num_classes)

    def forward(self, input_id, mask):
        # (batch, seq_len, hidden) from BERT's last layer.
        output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
        # Conv1d wants channels first: (batch, hidden, seq_len).
        output = output.permute(0, 2, 1)
        output = self.conv1d(output)
        # Back to (batch, seq_len, features) for the LSTM.
        output, _ = self.bilstm(output.transpose(1, 2))
        output = self.dropout(output)
        # Mean-pool over the sequence, then project to class logits.
        output = self.linear(output.mean(dim=1))
        return output
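# Shape trace for one forward pass (hand-derived from the layers above, with
# batch size B, max_length 512, and this checkpoint's hidden size of 768):
#   last_hidden_state:    (B, 512, 768)
#   after permute:        (B, 768, 512)
#   after conv1d (k=5):   (B, 128, 508)
#   after bilstm:         (B, 508, 128)   # 64 per direction, concatenated
#   after mean + linear:  (B, 5)          # one logit per class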
class Dataset(torch.utils.data.Dataset):
    """Tokenize every text up front and serve the encodings one by one."""

    def __init__(self, df):
        self.texts = [tokenizer(text, padding='max_length', max_length=512,
                                truncation=True, return_tensors="pt")
                      for text in df]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]
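# Hypothetical usage: each item is a BatchEncoding whose tensors carry an
# extra leading dim of 1 from return_tensors="pt" (squeezed away in evaluate):
#   ds = Dataset(["merhaba dünya"])
#   ds[0]["input_ids"].shape  ->  torch.Size([1, 512])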
def evaluate(model, test_data):
    """Run the model over test_data and return the predicted class indices."""
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    device = torch.device("cpu")
    model = model.to(device)
    model.eval()  # disable dropout during inference

    output_indices = []
    with torch.no_grad():
        for test_input in test_dataloader:
            mask = test_input['attention_mask'].to(device)
            # Drop the per-sample batch dim added by return_tensors="pt".
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_indices = output.argmax(dim=1).tolist()
            output_indices.extend(batch_indices)
    return output_indices
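# evaluate returns one class index per row, in input order (e.g. a list like
# [1, 0, 3]); predict below maps these indices back to label names.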
|
|
def auth(username, password):
    """Gradio login check against the fixed competition credentials."""
    return username == "Hive_Hereos" and password == "Y2IB3HV8GBXED00S"
|
|
model = BertClassifierConv1D()
model.load_state_dict(torch.load(r"sontotalmodel_finallll.pt", map_location=torch.device('cpu')))
|
|
import logging

logging.basicConfig(filename=r'app.log', filemode='w',
                    format='%(asctime)s - %(message)s', level=logging.INFO)
|
|
|
|
def predict(df):
    """Classify each row of df, adding 'target' and 'offensive' columns."""
    df["offensive"] = 1
    df["target"] = None
    try:
        # Keep the raw text so it can be restored after preprocessing.
        text = df["text"]
        df = prep_and_sw_and_tokenize(df)

        labels = {'INSULT': 0,
                  'OTHER': 1,
                  'PROFANITY': 2,
                  'RACIST': 3,
                  'SEXIST': 4}
        logging.info("Starting")
        logging.info("Model loaded")
        logging.info(df.text)
        a = evaluate(model, df["text"])

        # Map predicted indices back to their label names.
        index_to_label = {v: k for k, v in labels.items()}
        df["target"] = [index_to_label[number] for number in a]

        # Only the OTHER class counts as non-offensive.
        df.loc[df["target"] == "OTHER", "offensive"] = 0
        df["text"] = text
    except Exception as e:
        logging.error("Error occurred", exc_info=True)
        raise e
    return df
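# Resulting frame layout (illustrative, made-up row):
#   text                target     offensive
#   "örnek bir cümle"   "INSULT"   1
# 'offensive' stays 1 for every class except OTHER.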
|
|
def get_file(file):
    """Read the uploaded pipe-separated CSV, classify it, and return the output path."""
    output_file = "output_Hive_Hereos.csv"
    file_name = file.name.replace("\\", "/")
    df = pd.read_csv(file_name, sep="|")
    predict(df)
    df.to_csv(output_file, index=False, sep="|")
    return output_file
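# Expected upload format (made-up rows): a pipe-separated CSV with at least a
# "text" column, e.g.
#   id|text
#   1|ilk örnek cümle
#   2|ikinci örnek cümle
# The labeled file is written to output_Hive_Hereos.csv in the same format.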
|
|
iface = gr.Interface(get_file, "file", "file")

if __name__ == "__main__":
    iface.launch(share=True, auth=auth, debug=True)

iface.close()
|
|
import session_info
session_info.show()
|
|
|
|