# main — "fresh deploy with external models" (commit 02c45ef)
# NOTE(review): lines above were web file-viewer residue (raw / history blame /
# file size), not code; preserved here as a comment so the file parses.
import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import pickle
import re
import os
from nltk.tokenize.toktok import ToktokTokenizer
class CoolLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier.

    Pipeline: token-id sequence -> embedding (+dropout) -> bi-LSTM ->
    concatenated final forward/backward hidden states -> dropout -> linear head.
    """

    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        super(CoolLSTMClassifier, self).__init__()
        # padding_idx=0 keeps the <PAD> token's embedding fixed at zero
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden
        # inter-layer dropout is only meaningful with more than one stacked layer
        lstm_dropout = dropout if layerAmt > 1 else 0
        self.lstm = nn.LSTM(
            embeddingDim,
            dimHidden,
            layerAmt,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        # * 2: forward and backward directions are concatenated before the head
        self.fc = nn.Linear(dimHidden * 2, num_classes)

    def forward(self, x):
        """Return raw class logits with shape (batch, num_classes)."""
        emb = self.embedding_dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(emb)
        # final hidden states of the last layer: [-2] = forward, [-1] = backward
        combined = torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
        return self.fc(self.dropout(combined))
# Use the GPU when available; models and input tensors are moved here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Regex-based word tokenizer (no corpus download required, unlike punkt).
tokenizer = ToktokTokenizer()
# Lazily populated by load_resources() on the first prediction request.
vocab = None
models = None
embeddingMatrix = None
def load_resources():
    """Lazily initialize the vocab, embedding matrix, and 5-model ensemble.

    Populates the module-level globals on first call; subsequent calls
    return immediately. Reads pickled vocab and .npy embeddings from
    data/processed/ and one checkpoint per ensemble member from models/.
    """
    global vocab, models, embeddingMatrix
    if vocab is not None and models is not None:
        return  # already loaded

    print("loading vocab and models...")
    # NOTE(review): pickle.load is only safe because these are our own files.
    with open('data/processed/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    embeddingMatrix = np.load('data/processed/embedding_matrix.npy')

    # hyperparameters must match the values used at training time
    n_vocab = len(vocab)
    emb_dim, hidden_dim, n_layers = 300, 96, 1
    n_classes, drop = 2, 0.5

    models = []
    for idx in range(1, 6):
        net = CoolLSTMClassifier(n_vocab, emb_dim, hidden_dim, n_layers, n_classes, drop)
        net.load_state_dict(torch.load(f'models/ensemble_model_{idx}.pth', map_location=device))
        # overwrite with the frozen pretrained embeddings after the state load
        net.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
        net.embedding.weight.requires_grad = False
        net = net.to(device)
        net.eval()
        models.append(net)
    print("models loaded")
def cleanText(text):
    """Strip HTML tags and collapse whitespace runs; '' for falsy input."""
    if not text:
        return ""
    stripped = re.sub(r'<[^>]+>', '', str(text))
    # split()/join collapses any run of whitespace (incl. newlines) to one space
    return ' '.join(stripped.split())
def cleanTokenize(text):
    """Lowercase, keep only [a-z0-9] and whitespace, then tokenize to words."""
    normalized = re.sub(r'[^a-z0-9\s]', '', str(text).lower())
    return tokenizer.tokenize(normalized)
def predict_review(text):
    """Classify review text as fake / real / uncertain via the ensemble.

    Returns (label, confidence, probability-breakdown string). A class label
    is assigned only when its averaged probability clears the 0.75 threshold;
    anything in between is "uncertain".
    """
    load_resources()

    tokens = cleanTokenize(cleanText(text))
    if not tokens:
        return "invalid input", 0.0, "n/a"

    # map tokens to vocab ids, then truncate/pad to a fixed length of 256
    maxLen = 256
    ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens]
    if len(ids) > maxLen:
        ids = ids[:maxLen]
    else:
        ids += [vocab['<PAD>']] * (maxLen - len(ids))

    batch = torch.LongTensor([ids]).to(device)

    # average softmax probabilities over all ensemble members
    memberProbs = []
    with torch.no_grad():
        for net in models:
            logits = net(batch)
            memberProbs.append(torch.softmax(logits, dim=1).cpu().numpy())
    avgProbs = np.mean(memberProbs, axis=0)[0]

    realProb, fakeProb = avgProbs[0], avgProbs[1]
    confidence = max(fakeProb, realProb)

    # symmetric 75% decision threshold for both classes
    threshold = 0.75
    if fakeProb >= threshold:
        prediction = "fake"
    elif realProb >= threshold:
        prediction = "real"
    else:
        prediction = "uncertain"
    return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
# Gradio UI: one review textbox in; prediction label, confidence score,
# and a per-class probability breakdown out.
demo = gr.Interface(
    fn=predict_review,
    inputs=gr.Textbox(
        lines=5,
        placeholder="paste review text here",
        label="review text"
    ),
    outputs=[
        gr.Textbox(label="prediction"),
        gr.Number(label="confidence"),
        gr.Textbox(label="probabilities")
    ],
    title="sentinelcheck",
    description="fake review detector using ensemble lstm models (75% threshold)",
    # one obviously-fake and one plausible example for quick demoing
    examples=[
        ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
        ["decent quality for the price. took about a week to arrive. works as expected."]
    ]
)
# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()