"""sentinelcheck: Gradio UI for a fake-review detector backed by an ensemble
of five bidirectional-LSTM classifiers with frozen pretrained embeddings."""

import os
import pickle
import re

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
from nltk.tokenize.toktok import ToktokTokenizer


class CoolLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier.

    Args:
        vocabSize: number of rows in the embedding table.
        embeddingDim: embedding vector size.
        dimHidden: LSTM hidden size per direction.
        layerAmt: number of stacked LSTM layers.
        num_classes: number of output classes (2: real vs fake).
        dropout: dropout rate (between stacked layers and before the head).
    """

    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        super().__init__()
        # padding_idx=0: index 0 embeds to a zero vector and receives no gradient.
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden
        # nn.LSTM warns if dropout > 0 with a single layer, hence the guard.
        self.lstm = nn.LSTM(
            embeddingDim,
            dimHidden,
            layerAmt,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if layerAmt > 1 else 0,
        )
        self.dropout = nn.Dropout(dropout)
        # x2: forward and backward final hidden states are concatenated.
        self.fc = nn.Linear(dimHidden * 2, num_classes)

    def forward(self, x):
        """Map token indices (batch, seq_len) to class logits (batch, num_classes)."""
        embedded = self.embedding_dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(embedded)
        # hidden: (num_layers * 2, batch, dimHidden); rows -2 / -1 are the top
        # layer's forward / backward final states.
        combined = torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
        return self.fc(self.dropout(combined))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = ToktokTokenizer()

# Lazily populated by load_resources().
vocab = None
models = None
embeddingMatrix = None


def load_resources():
    """Load the vocab, embedding matrix and five ensemble checkpoints (once).

    Idempotent after the first successful call. NOTE(review): not guarded by a
    lock, so concurrent first requests may load the models more than once.
    """
    global vocab, models, embeddingMatrix
    if vocab is not None and models is not None:
        return
    print("loading vocab and models...")
    # NOTE(review): pickle.load / torch.load can execute arbitrary code on
    # untrusted files; acceptable here because these are our own artifacts.
    # Consider torch.load(..., weights_only=True) on torch >= 1.13.
    with open('data/processed/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    embeddingMatrix = np.load('data/processed/embedding_matrix.npy')

    # Hyperparameters must match the values used at training time.
    vocabSize = len(vocab)
    embeddingDim = 300
    dimHidden = 96
    layerAmt = 1
    num_classes = 2
    dropout = 0.5

    models = []
    for i in range(1, 6):
        model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
        model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
        # Overwrite the checkpoint's embedding with the frozen pretrained matrix.
        model.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
        model.embedding.weight.requires_grad = False
        model = model.to(device)
        model.eval()
        models.append(model)
    print("models loaded")


def cleanText(text):
    """Strip HTML tags and collapse runs of whitespace; tolerates None/non-str."""
    if not text:
        return ""
    text = str(text)
    text = re.sub(r'<[^>]+>', '', text)
    return ' '.join(text.split())


def cleanTokenize(text):
    """Lowercase, drop everything except [a-z0-9 whitespace], tokenize with Toktok."""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return tokenizer.tokenize(text)


def predict_review(text):
    """Classify a review as fake/real/uncertain using the model ensemble.

    Returns:
        (prediction label, confidence as float, probability-breakdown string).
    """
    load_resources()
    tokens = cleanTokenize(cleanText(text))
    if not tokens:
        return "invalid input", 0.0, "n/a"

    # NOTE(review): '' serves as both the OOV and padding key; it looks like a
    # special token (e.g. '<unk>' / '<pad>') whose name was lost. Fall back to
    # index 0 (the embedding's declared padding_idx) if '' is missing, instead
    # of raising KeyError — TODO confirm the intended vocab keys.
    fill_idx = vocab.get('', 0)
    indices = [vocab.get(token, fill_idx) for token in tokens]

    # Truncate or right-pad to a fixed sequence length.
    maxLen = 256
    if len(indices) > maxLen:
        indices = indices[:maxLen]
    else:
        indices = indices + [fill_idx] * (maxLen - len(indices))

    inpTensor = torch.LongTensor([indices]).to(device)

    # Average softmax probabilities across the five ensemble members.
    allOutputs = []
    with torch.no_grad():
        for model in models:
            probs = torch.softmax(model(inpTensor), dim=1)
            allOutputs.append(probs.cpu().numpy())
    avgProbs = np.mean(allOutputs, axis=0)[0]

    realProb = avgProbs[0]
    fakeProb = avgProbs[1]
    confidence = max(fakeProb, realProb)

    # Abstain ("uncertain") unless one class clears the 75% threshold.
    fakeThreshold = 0.75
    realThreshold = 0.75
    if fakeProb >= fakeThreshold:
        prediction = "fake"
    elif realProb >= realThreshold:
        prediction = "real"
    else:
        prediction = "uncertain"

    return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"


demo = gr.Interface(
    fn=predict_review,
    inputs=gr.Textbox(
        lines=5,
        placeholder="paste review text here",
        label="review text"
    ),
    outputs=[
        gr.Textbox(label="prediction"),
        gr.Number(label="confidence"),
        gr.Textbox(label="probabilities")
    ],
    title="sentinelcheck",
    description="fake review detector using ensemble lstm models (75% threshold)",
    examples=[
        ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
        ["decent quality for the price. took about a week to arrive. works as expected."]
    ]
)

if __name__ == "__main__":
    demo.launch()