import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import pickle
import re
from nltk.tokenize.toktok import ToktokTokenizer
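
# Bidirectional LSTM classifier: frozen pretrained embeddings -> BiLSTM ->
# concatenation of the final forward/backward hidden states -> dropout ->
# linear classification head.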
class CoolLSTMClassifier(nn.Module):
    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden
        self.lstm = nn.LSTM(
            embeddingDim,
            dimHidden,
            layerAmt,
            batch_first=True,
            bidirectional=True,
            # nn.LSTM only applies inter-layer dropout when num_layers > 1
            dropout=dropout if layerAmt > 1 else 0,
        )
        self.dropout = nn.Dropout(dropout)
        # bidirectional: the classifier sees [forward_hidden; backward_hidden]
        self.fc = nn.Linear(dimHidden * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        _, (hidden, _) = self.lstm(embedded)
        # hidden has shape (num_layers * 2, batch, dimHidden); the last two
        # entries are the top layer's final forward and backward states
        forward_hidden = hidden[-2, :, :]
        backward_hidden = hidden[-1, :, :]
        combined = torch.cat([forward_hidden, backward_hidden], dim=1)
        combined = self.dropout(combined)
        return self.fc(combined)
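
# Module-level state, populated lazily by load_resources() so the app starts
# quickly and only pays the model-loading cost on the first prediction.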
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = ToktokTokenizer()

vocab = None
models = None
embeddingMatrix = None

def load_resources():
    global vocab, models, embeddingMatrix
    if vocab is not None and models is not None:
        return
    print("loading vocab and models...")
    with open('data/processed/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    embeddingMatrix = np.load('data/processed/embedding_matrix.npy')

    # These hyperparameters must match the training configuration of the
    # saved checkpoints, or load_state_dict will fail on shape mismatches.
    vocabSize = len(vocab)
    embeddingDim = 300
    dimHidden = 96
    layerAmt = 1
    num_classes = 2
    dropout = 0.5

    models = []
    for i in range(1, 6):
        model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
        model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
        # Restore the pretrained embedding matrix and keep it frozen
        # (assumed to match training, where the embeddings were not fine-tuned).
        model.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
        model.embedding.weight.requires_grad = False
        model = model.to(device)
        model.eval()
        models.append(model)
    print("models loaded")
def cleanText(text):
    if not text:
        return ""
    text = str(text)
    text = re.sub(r'<[^>]+>', '', text)  # strip HTML tags
    text = ' '.join(text.split())        # collapse whitespace
    return text

def cleanTokenize(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # keep only lowercase alphanumerics
    tokens = tokenizer.tokenize(text)
    return tokens

def predict_review(text):
    load_resources()
    cleaned = cleanText(text)
    tokens = cleanTokenize(cleaned)
    if len(tokens) == 0:
        return "invalid input", 0.0, "n/a"

    # Map tokens to vocab indices, then truncate or pad to a fixed length.
    indices = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    maxLen = 256
    if len(indices) > maxLen:
        indices = indices[:maxLen]
    else:
        indices = indices + [vocab['<PAD>']] * (maxLen - len(indices))
    inpTensor = torch.LongTensor([indices]).to(device)

    # Average the softmax probabilities across the five ensemble members.
    allOutputs = []
    with torch.no_grad():
        for model in models:
            outputs = model(inpTensor)
            probs = torch.softmax(outputs, dim=1)
            allOutputs.append(probs.cpu().numpy())
    avgProbs = np.mean(allOutputs, axis=0)[0]
    fakeProb = avgProbs[1]  # class 1 = fake, class 0 = real
    realProb = avgProbs[0]
    confidence = max(fakeProb, realProb)

    # Only commit to a label when the ensemble is at least 75% confident;
    # anything in between is reported as "uncertain".
    fakeThreshold = 0.75
    realThreshold = 0.75
    if fakeProb >= fakeThreshold:
        prediction = "fake"
    elif realProb >= realThreshold:
        prediction = "real"
    else:
        prediction = "uncertain"
    return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
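
# Gradio UI: one text box in; label, confidence score, and raw class
# probabilities out.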
demo = gr.Interface(
    fn=predict_review,
    inputs=gr.Textbox(
        lines=5,
        placeholder="paste review text here",
        label="review text"
    ),
    outputs=[
        gr.Textbox(label="prediction"),
        gr.Number(label="confidence"),
        gr.Textbox(label="probabilities")
    ],
    title="sentinelcheck",
    description="fake review detector using ensemble lstm models (75% threshold)",
    examples=[
        ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
        ["decent quality for the price. took about a week to arrive. works as expected."]
    ]
)

if __name__ == "__main__":
    demo.launch()
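
# To run the app (assuming this file is saved as app.py and the hard-coded
# paths under models/ and data/processed/ exist):
#   $ python app.py    # Gradio prints a local URL to open in a browser
# The predictor can also be called directly, e.g. from a Python shell:
#   >>> from app import predict_review
#   >>> prediction, confidence, probs = predict_review("works as expected")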