File size: 4,611 Bytes
02c45ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import pickle
import re
import os
from nltk.tokenize.toktok import ToktokTokenizer
class CoolLSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier on top of an embedding layer.

    Token ids are embedded, run through a bidirectional LSTM, and the last
    two entries of the LSTM's final hidden state (named forward/backward
    below) are concatenated and projected to ``num_classes`` logits.
    """

    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        """Build the embedding, LSTM, dropout and output layers.

        Args:
            vocabSize: number of rows in the embedding table.
            embeddingDim: width of each embedding vector (LSTM input size).
            dimHidden: hidden size of the LSTM, per direction.
            layerAmt: number of stacked LSTM layers.
            num_classes: number of output logits.
            dropout: probability used for the pre-classifier dropout and,
                when ``layerAmt > 1``, for the LSTM's own dropout.
        """
        super().__init__()

        # The LSTM only receives a non-zero dropout when layers are stacked.
        lstm_dropout = dropout if layerAmt > 1 else 0

        # Index 0 is the padding row of the embedding table.
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        # Fixed at 0.3, independent of the ``dropout`` argument.
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden
        self.lstm = nn.LSTM(
            input_size=embeddingDim,
            hidden_size=dimHidden,
            num_layers=layerAmt,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        # Two directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * dimHidden, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids; the LSTM is batch-first, so this
                is presumably shaped (batch, seq_len).

        Returns:
            Tensor of logits with one row per batch item and
            ``num_classes`` columns.
        """
        token_vectors = self.embedding_dropout(self.embedding(x))
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The last two slices of the final hidden state are treated as the
        # forward and backward summaries of the sequence.
        summary = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        return self.fc(self.dropout(summary))
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Shared tokenizer instance used by cleanTokenize().
tokenizer = ToktokTokenizer()

# Lazily filled caches; load_resources() populates them on first use.
vocab = models = embeddingMatrix = None
def load_resources():
    """Load the vocabulary, embedding matrix and ensemble models once.

    Populates the module-level ``vocab``, ``embeddingMatrix`` and ``models``
    globals. Later calls return immediately once both ``vocab`` and
    ``models`` are set.

    Everything is assembled in local variables and published to the globals
    only after all five checkpoints have loaded. If any file is missing or a
    checkpoint fails to load, the exception propagates and the globals stay
    untouched, so the next call retries instead of silently running with a
    partial (or empty) ensemble.

    Reads:
        data/processed/vocab.pkl
        data/processed/embedding_matrix.npy
        models/ensemble_model_1.pth ... models/ensemble_model_5.pth
    """
    global vocab, models, embeddingMatrix
    if vocab is not None and models is not None:
        return

    print("loading vocab and models...")

    # NOTE(security): pickle.load can execute arbitrary code; vocab.pkl must
    # come from a trusted source.
    with open('data/processed/vocab.pkl', 'rb') as f:
        loaded_vocab = pickle.load(f)
    loaded_matrix = np.load('data/processed/embedding_matrix.npy')

    # Architecture hyperparameters; load_state_dict below will reject a
    # checkpoint whose tensor shapes disagree with these.
    vocabSize = len(loaded_vocab)
    embeddingDim = 300
    dimHidden = 96
    layerAmt = 1
    num_classes = 2
    dropout = 0.5

    # Convert once; every ensemble member receives a copy of the same matrix.
    embedding_weights = torch.from_numpy(loaded_matrix)

    loaded_models = []
    for i in range(1, 6):
        model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
        model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
        # Overwrites the checkpoint's embedding weights with the matrix from
        # disk and freezes them. NOTE(review): presumably the embeddings were
        # frozen to this matrix during training as well -- confirm.
        model.embedding.weight.data.copy_(embedding_weights)
        model.embedding.weight.requires_grad = False
        model = model.to(device)
        model.eval()
        loaded_models.append(model)

    # Publish only after everything loaded successfully.
    embeddingMatrix = loaded_matrix
    vocab = loaded_vocab
    models = loaded_models
    print("models loaded")
def cleanText(text):
    """Strip HTML-like tags from *text* and collapse runs of whitespace.

    Falsy input (``None``, ``""``, ``0``, ...) yields an empty string; any
    other value is converted with ``str()`` first.
    """
    if not text:
        return ""
    # Remove anything that looks like <...>; tags are dropped, not replaced
    # by a space.
    without_tags = re.sub(r'<[^>]+>', '', str(text))
    # split() with no argument also trims leading/trailing whitespace.
    return ' '.join(without_tags.split())
def cleanTokenize(text):
    """Lowercase *text*, drop non-alphanumeric characters and tokenize it.

    Only ``a-z``, ``0-9`` and whitespace survive the filter; the result is
    split with the module-level ``tokenizer``.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return tokenizer.tokenize(alnum_only)
def predict_review(text):
    """Classify a review as fake, real or uncertain using the ensemble.

    Args:
        text: raw review text from the UI.

    Returns:
        A ``(prediction, confidence, probabilities)`` tuple:
        ``prediction`` is ``"fake"``, ``"real"``, ``"uncertain"`` or
        ``"invalid input"``; ``confidence`` is the larger of the two averaged
        class probabilities as a float; ``probabilities`` is a formatted
        string with both values. Input that produces no tokens returns
        ``("invalid input", 0.0, "n/a")``.
    """
    load_resources()

    tokens = cleanTokenize(cleanText(text))
    if not tokens:
        return "invalid input", 0.0, "n/a"

    # Map tokens to vocabulary ids, falling back to the unknown-token id.
    unk_id = vocab['<UNK>']
    token_ids = [vocab.get(tok, unk_id) for tok in tokens]

    # Force the sequence to exactly maxLen ids: truncate or right-pad.
    maxLen = 256
    shortfall = maxLen - len(token_ids)
    if shortfall < 0:
        token_ids = token_ids[:maxLen]
    else:
        token_ids = token_ids + [vocab['<PAD>']] * shortfall

    # Batch of one sequence.
    batch = torch.LongTensor([token_ids]).to(device)

    with torch.no_grad():
        per_model_probs = [
            torch.softmax(member(batch), dim=1).cpu().numpy()
            for member in models
        ]

    # Average over ensemble members, then take the single batch row.
    mean_probs = np.mean(per_model_probs, axis=0)[0]
    realProb = mean_probs[0]  # class index 0 is "real"
    fakeProb = mean_probs[1]  # class index 1 is "fake"

    fakeThreshold = 0.75
    realThreshold = 0.75
    # Anything that clears neither threshold is reported as uncertain.
    prediction = "uncertain"
    if fakeProb >= fakeThreshold:
        prediction = "fake"
    elif realProb >= realThreshold:
        prediction = "real"

    confidence = max(fakeProb, realProb)
    summary = f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
    return prediction, float(confidence), summary
def _build_interface():
    """Assemble the gradio interface that wraps predict_review."""
    # One multi-line text input for the review.
    review_box = gr.Textbox(
        lines=5,
        placeholder="paste review text here",
        label="review text",
    )
    # One output component per element of predict_review's return tuple.
    result_fields = [
        gr.Textbox(label="prediction"),
        gr.Number(label="confidence"),
        gr.Textbox(label="probabilities"),
    ]
    sample_reviews = [
        ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
        ["decent quality for the price. took about a week to arrive. works as expected."],
    ]
    return gr.Interface(
        fn=predict_review,
        inputs=review_box,
        outputs=result_fields,
        title="sentinelcheck",
        description="fake review detector using ensemble lstm models (75% threshold)",
        examples=sample_reviews,
    )


demo = _build_interface()

if __name__ == "__main__":
    demo.launch()
|