Spaces:

codingcoolfun9ed
/

sentinelcheck-api

Running

File size: 4,611 Bytes

02c45ef

import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import pickle
import re
import os
from nltk.tokenize.toktok import ToktokTokenizer

class CoolLSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier on top of an embedding layer.

    Token indices are embedded, passed through dropout and a bidirectional
    LSTM. The final hidden states of the last forward and backward
    directions are concatenated, passed through dropout, and projected to
    ``num_classes`` logits by a linear layer.

    Args:
        vocabSize: number of rows in the embedding table.
        embeddingDim: width of each embedding vector.
        dimHidden: hidden size of the LSTM, per direction.
        layerAmt: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: dropout probability applied before the linear layer and,
            when ``layerAmt > 1``, handed to the LSTM as well.
    """

    def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
        super().__init__()

        # Row 0 of the table is declared as the padding index.
        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
        # Embedding dropout is fixed at 0.3, independent of ``dropout``.
        self.embedding_dropout = nn.Dropout(0.3)
        self.dimHidden = dimHidden

        # The LSTM only receives a non-zero dropout when layers are stacked.
        if layerAmt > 1:
            lstm_dropout = dropout
        else:
            lstm_dropout = 0
        self.lstm = nn.LSTM(
            input_size=embeddingDim,
            hidden_size=dimHidden,
            num_layers=layerAmt,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )

        self.dropout = nn.Dropout(dropout)
        # Two directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(dimHidden * 2, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of token-index sequences.

        ``x`` is consumed batch-first (the LSTM is built with
        ``batch_first=True``).
        """
        token_vectors = self.embedding_dropout(self.embedding(x))
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The last two entries of the hidden-state stack are the forward and
        # backward final states of the top layer.
        both_directions = (final_hidden[-2], final_hidden[-1])
        features = torch.cat(both_directions, dim=1)
        return self.fc(self.dropout(features))

# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Single tokenizer instance shared by cleanTokenize.
tokenizer = ToktokTokenizer()

# Filled in lazily by load_resources(); None means "not loaded yet".
vocab = models = embeddingMatrix = None

def load_resources():
    """Load the vocabulary, embedding matrix and five ensemble models once.

    On success the module globals ``vocab``, ``embeddingMatrix`` and
    ``models`` are populated; later calls return immediately.

    Everything is built in local variables and only published to the
    globals after every model has loaded. If any file is missing or a
    checkpoint fails to load, the exception propagates and the globals are
    left untouched, so the next call retries instead of silently running
    with an empty or partial ensemble.

    Raises:
        Whatever ``open``, ``pickle.load``, ``np.load``, ``torch.load`` or
        ``load_state_dict`` raise for a missing or incompatible artifact.
    """
    global vocab, models, embeddingMatrix

    if vocab is not None and models is not None:
        return

    print("loading vocab and models...")

    # NOTE(security): pickle.load can execute arbitrary code from the file.
    # This path must only ever point at a trusted, locally shipped artifact.
    with open('data/processed/vocab.pkl', 'rb') as f:
        loaded_vocab = pickle.load(f)

    loaded_matrix = np.load('data/processed/embedding_matrix.npy')
    # Loop-invariant: the same tensor view is copied into every model.
    embedding_weights = torch.from_numpy(loaded_matrix)

    # Architecture hyperparameters passed to every CoolLSTMClassifier below.
    vocabSize = len(loaded_vocab)
    embeddingDim = 300
    dimHidden = 96
    layerAmt = 1
    num_classes = 2
    dropout = 0.5

    loaded_models = []
    for i in range(1, 6):
        model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
        model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
        # The embedding table is overwritten with the saved matrix and frozen.
        model.embedding.weight.data.copy_(embedding_weights)
        model.embedding.weight.requires_grad = False
        model = model.to(device)
        model.eval()
        loaded_models.append(model)

    # Publish only now that every artifact loaded successfully. ``models`` is
    # assigned last because the guard above requires it to be non-None.
    embeddingMatrix = loaded_matrix
    vocab = loaded_vocab
    models = loaded_models

    print("models loaded")

def cleanText(text):
    """Strip HTML-like tags from ``text`` and collapse runs of whitespace.

    Any falsy input (``None``, ``""``, ``0`` ...) yields an empty string.
    Other values are converted with ``str()`` first.
    """
    if text:
        without_tags = re.sub(r'<[^>]+>', '', str(text))
        # split() with no argument drops leading/trailing whitespace and
        # treats every whitespace run as one separator.
        return ' '.join(without_tags.split())
    return ""

def cleanTokenize(text):
    """Lowercase ``text``, drop unwanted characters and tokenize the rest.

    Every character other than ``a-z``, ``0-9`` and whitespace is removed
    before the text is handed to the module-level ``tokenizer``.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return tokenizer.tokenize(alnum_only)

def predict_review(text):
    """Classify a review with the model ensemble.

    Returns a 3-tuple ``(prediction, confidence, detail)``:

    * ``prediction`` is ``"fake"``, ``"real"`` or ``"uncertain"``; a class is
      chosen only when its averaged probability is at least 0.75.
    * ``confidence`` is the larger of the two averaged probabilities, as a
      Python float.
    * ``detail`` is a string showing both probabilities to three decimals.

    Text that produces no tokens yields ``("invalid input", 0.0, "n/a")``.
    """
    load_resources()

    tokens = cleanTokenize(cleanText(text))
    if len(tokens) == 0:
        return "invalid input", 0.0, "n/a"

    # Tokens missing from the vocabulary all map to the <UNK> id.
    unk_id = vocab['<UNK>']
    token_ids = [vocab.get(tok, unk_id) for tok in tokens]

    # Bring the sequence to exactly maxLen ids: cut the tail or right-pad.
    maxLen = 256
    shortfall = maxLen - len(token_ids)
    if shortfall < 0:
        token_ids = token_ids[:maxLen]
    else:
        token_ids = token_ids + [vocab['<PAD>']] * shortfall

    # The extra list level makes this a batch containing one sequence.
    batch = torch.LongTensor([token_ids]).to(device)

    with torch.no_grad():
        per_model_probs = [
            torch.softmax(member(batch), dim=1).cpu().numpy()
            for member in models
        ]

    # Average over the ensemble, then take the single batch row.
    mean_probs = np.mean(per_model_probs, axis=0)[0]
    realProb, fakeProb = mean_probs[0], mean_probs[1]

    fakeThreshold = realThreshold = 0.75
    if fakeProb >= fakeThreshold:
        label = "fake"
    elif realProb >= realThreshold:
        label = "real"
    else:
        label = "uncertain"

    top_prob = float(max(fakeProb, realProb))
    return label, top_prob, f"fake: {fakeProb:.3f}, real: {realProb:.3f}"

# Multi-line textbox that receives the review to classify.
_review_input = gr.Textbox(
    label="review text",
    placeholder="paste review text here",
    lines=5,
)

# One output component per element of predict_review's return tuple.
_result_outputs = [
    gr.Textbox(label="prediction"),
    gr.Number(label="confidence"),
    gr.Textbox(label="probabilities"),
]

# Sample inputs offered by the interface; each inner list is one example row.
_sample_reviews = [
    ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
    ["decent quality for the price. took about a week to arrive. works as expected."],
]

demo = gr.Interface(
    fn=predict_review,
    inputs=_review_input,
    outputs=_result_outputs,
    examples=_sample_reviews,
    title="sentinelcheck",
    description="fake review detector using ensemble lstm models (75% threshold)",
)

# Launch the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()