# main — "fresh deploy with external models" (commit 02c45ef)
# NOTE(review): lines above were web file-viewer residue (raw / history blame /
# file size), not code; preserved here as a comment so the file parses.
import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import pickle
import re
import os
from nltk.tokenize.toktok import ToktokTokenizer
class CoolLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier.

    Pipeline: token-id sequence -> embedding (+dropout) -> bi-LSTM ->
    concatenated final forward/backward hidden states -> dropout -> linear head.
    """

    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        super(CoolLSTMClassifier, self).__init__()
        # padding_idx=0 keeps the <PAD> token's embedding fixed at zero
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden
        # inter-layer dropout is only meaningful with more than one stacked layer
        lstm_dropout = dropout if layerAmt > 1 else 0
        self.lstm = nn.LSTM(
            embeddingDim,
            dimHidden,
            layerAmt,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        # * 2: forward and backward directions are concatenated before the head
        self.fc = nn.Linear(dimHidden * 2, num_classes)

    def forward(self, x):
        """Return raw class logits with shape (batch, num_classes)."""
        emb = self.embedding_dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(emb)
        # final hidden states of the last layer: [-2] = forward, [-1] = backward
        combined = torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
        return self.fc(self.dropout(combined))
# Use the GPU when available; models and input tensors are moved here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Regex-based word tokenizer (no corpus download required, unlike punkt).
tokenizer = ToktokTokenizer()
# Lazily populated by load_resources() on the first prediction request.
vocab = None
models = None
embeddingMatrix = None
def load_resources():
    """Lazily initialize the vocab, embedding matrix, and 5-model ensemble.

    Populates the module-level globals on first call; subsequent calls
    return immediately. Reads pickled vocab and .npy embeddings from
    data/processed/ and one checkpoint per ensemble member from models/.
    """
    global vocab, models, embeddingMatrix
    if vocab is not None and models is not None:
        return  # already loaded

    print("loading vocab and models...")
    # NOTE(review): pickle.load is only safe because these are our own files.
    with open('data/processed/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    embeddingMatrix = np.load('data/processed/embedding_matrix.npy')

    # hyperparameters must match the values used at training time
    n_vocab = len(vocab)
    emb_dim, hidden_dim, n_layers = 300, 96, 1
    n_classes, drop = 2, 0.5

    models = []
    for idx in range(1, 6):
        net = CoolLSTMClassifier(n_vocab, emb_dim, hidden_dim, n_layers, n_classes, drop)
        net.load_state_dict(torch.load(f'models/ensemble_model_{idx}.pth', map_location=device))
        # overwrite with the frozen pretrained embeddings after the state load
        net.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
        net.embedding.weight.requires_grad = False
        net = net.to(device)
        net.eval()
        models.append(net)
    print("models loaded")
def cleanText(text):
    """Strip HTML tags and collapse whitespace runs; '' for falsy input."""
    if not text:
        return ""
    stripped = re.sub(r'<[^>]+>', '', str(text))
    # split()/join collapses any run of whitespace (incl. newlines) to one space
    return ' '.join(stripped.split())
def cleanTokenize(text):
    """Lowercase, keep only [a-z0-9] and whitespace, then tokenize to words."""
    normalized = re.sub(r'[^a-z0-9\s]', '', str(text).lower())
    return tokenizer.tokenize(normalized)
def predict_review(text):
    """Classify review text as fake / real / uncertain via the ensemble.

    Returns (label, confidence, probability-breakdown string). A class label
    is assigned only when its averaged probability clears the 0.75 threshold;
    anything in between is "uncertain".
    """
    load_resources()

    tokens = cleanTokenize(cleanText(text))
    if not tokens:
        return "invalid input", 0.0, "n/a"

    # map tokens to vocab ids, then truncate/pad to a fixed length of 256
    maxLen = 256
    ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens]
    if len(ids) > maxLen:
        ids = ids[:maxLen]
    else:
        ids += [vocab['<PAD>']] * (maxLen - len(ids))

    batch = torch.LongTensor([ids]).to(device)

    # average softmax probabilities over all ensemble members
    memberProbs = []
    with torch.no_grad():
        for net in models:
            logits = net(batch)
            memberProbs.append(torch.softmax(logits, dim=1).cpu().numpy())
    avgProbs = np.mean(memberProbs, axis=0)[0]

    realProb, fakeProb = avgProbs[0], avgProbs[1]
    confidence = max(fakeProb, realProb)

    # symmetric 75% decision threshold for both classes
    threshold = 0.75
    if fakeProb >= threshold:
        prediction = "fake"
    elif realProb >= threshold:
        prediction = "real"
    else:
        prediction = "uncertain"
    return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
# Gradio UI: one review textbox in; prediction label, confidence score,
# and a per-class probability breakdown out.
demo = gr.Interface(
    fn=predict_review,
    inputs=gr.Textbox(
        lines=5,
        placeholder="paste review text here",
        label="review text"
    ),
    outputs=[
        gr.Textbox(label="prediction"),
        gr.Number(label="confidence"),
        gr.Textbox(label="probabilities")
    ],
    title="sentinelcheck",
    description="fake review detector using ensemble lstm models (75% threshold)",
    # one obviously-fake and one plausible example for quick demoing
    examples=[
        ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
        ["decent quality for the price. took about a week to arrive. works as expected."]
    ]
)
# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()