Commit 02c45ef · 0 parent(s) · main

fresh deploy with external models
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ models/
FETCH_HEAD ADDED
File without changes
README.md ADDED
@@ -0,0 +1,24 @@
+ ---
+ title: sentinelcheck-api
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: "5.9.1"
+ app_file: app.py
+ pinned: false
+ ---
+
+ # sentinelcheck - fake review detector
+
+ uses an ensemble of 5 bidirectional lstm models with glove embeddings to detect fake product reviews
+
+ ## how it works
+ - paste a review into the text box
+ - the model analyzes the text
+ - get a prediction (fake/real), a confidence score, and class probabilities
+
+ ## tech stack
+ - pytorch lstm models
+ - glove 300d embeddings
+ - gradio interface
api/.DS_Store ADDED
Binary file (6.15 kB).
api/__init__.py ADDED
File without changes
api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes).
api/__pycache__/predict.cpython-313.pyc ADDED
Binary file (7.21 kB).
api/app.py ADDED
@@ -0,0 +1,45 @@
+ from flask import Flask, request, jsonify
+ from flask_cors import CORS
+ import os
+ import sys
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ from api.predict import predict_review
+
+ app = Flask(__name__)
+ CORS(app)
+
+ @app.route('/health', methods=['GET'])
+ def health():
+     return jsonify({"status": "ok"}), 200
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     try:
+         data = request.get_json()
+
+         if not data or 'text' not in data:
+             return jsonify({"error": "missing 'text' field"}), 400
+
+         reviewText = data['text']
+
+         if not isinstance(reviewText, str):
+             return jsonify({"error": "'text' must be a string"}), 400
+
+         if len(reviewText.strip()) == 0:
+             return jsonify({"error": "text cannot be empty"}), 400
+
+         result = predict_review(reviewText)
+
+         return jsonify({
+             "prediction": result['prediction'],
+             "confidence": result['confidence'],
+             "is_fake": result['is_fake']
+         }), 200
+
+     except Exception as e:
+         return jsonify({"error": str(e)}), 500
+
+ if __name__ == '__main__':
+     print("starting api server")
+     app.run(host='0.0.0.0', port=5000, debug=False)
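A minimal sketch of exercising this endpoint, assuming api/app.py is running locally on port 5000 as configured above; the sample review text is made up:

import requests

BASE_URL = "http://localhost:5000"  # assumes a local run of api/app.py

# health check
print(requests.get(f"{BASE_URL}/health").json())  # {'status': 'ok'}

# prediction: the endpoint expects a JSON body with a 'text' field
resp = requests.post(
    f"{BASE_URL}/predict",
    json={"text": "this product is absolutely amazing! five stars!"},
)
print(resp.status_code)  # 200 on success, 400 on bad input, 500 on failure
print(resp.json())       # keys: prediction, confidence, is_fake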
api/predict.py ADDED
@@ -0,0 +1,115 @@
+ import torch
+ import numpy as np
+ import re
+ import os
+ from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
+
+ scriptDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ modelsDir = os.path.join(scriptDir, "models")
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ tokenizer = None
+ models = None
+
+ def load_resources():
+     global tokenizer, models
+
+     if tokenizer is not None and models is not None:
+         return
+
+     print("loading models...")
+
+     tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+     num_classes = 2
+     dropout = 0.4
+
+     models = []
+     for i in range(1, 6):
+         model = DistilBertForSequenceClassification.from_pretrained(
+             'distilbert-base-uncased',
+             num_labels=num_classes,
+             dropout=dropout
+         )
+         model.load_state_dict(torch.load(os.path.join(modelsDir, f"ensemble_model_{i}.pth"), map_location=device))
+         model = model.to(device)
+         model.eval()
+         models.append(model)
+
+     print("models loaded")
+
+ def cleanText(text):
+     if not text:
+         return ""
+     text = str(text)
+     text = re.sub(r'<[^>]+>', '', text)
+     text = ' '.join(text.split())
+     text = text.lower()
+     text = text.strip()
+     return text
+
+ def getLengthCategory(text):
+     words = text.split()
+     wordCount = len(words)
+     if wordCount <= 20:
+         return 'short'
+     elif wordCount <= 50:
+         return 'short-medium'
+     elif wordCount <= 100:
+         return 'medium'
+     elif wordCount <= 200:
+         return 'long'
+     else:
+         return 'very-long'
+
+ def predict_review(text):
+     load_resources()
+
+     cleaned = cleanText(text)
+
+     if not cleaned:
+         return {
+             "prediction": "invalid",
+             "confidence": 0.0,
+             "is_fake": False,
+             "error": "empty text after preprocessing"
+         }
+
+     encoding = tokenizer(
+         cleaned,
+         truncation=True,
+         padding='max_length',
+         max_length=256,
+         return_tensors='pt'
+     )
+
+     input_ids = encoding['input_ids'].to(device)
+     attention_mask = encoding['attention_mask'].to(device)
+
+     allOutputs = []
+     with torch.no_grad():
+         for model in models:
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+             probs = torch.softmax(outputs.logits, dim=1)
+             allOutputs.append(probs.cpu().numpy())
+
+     avgProbs = np.mean(allOutputs, axis=0)[0]
+     fakeProb = avgProbs[1]
+     realProb = avgProbs[0]
+
+     isFake = fakeProb > 0.5
+     confidence = max(fakeProb, realProb)
+     prediction = "fake" if isFake else "real"
+
+     if confidence < 0.75:
+         prediction = "uncertain"
+
+     lengthCat = getLengthCategory(cleaned)
+
+     return {
+         "prediction": prediction,
+         "confidence": float(confidence),
+         "is_fake": bool(isFake),
+         "length_category": lengthCat,
+         "token_count": len(cleaned.split())
+     }
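The ensemble step above reduces to averaging each model's softmax output and thresholding the result; a self-contained sketch with made-up per-model probabilities:

import numpy as np

# hypothetical softmax outputs from the 5 models for one review: [P(real), P(fake)]
allOutputs = [
    np.array([[0.20, 0.80]]),
    np.array([[0.30, 0.70]]),
    np.array([[0.25, 0.75]]),
    np.array([[0.15, 0.85]]),
    np.array([[0.35, 0.65]]),
]

avgProbs = np.mean(allOutputs, axis=0)[0]  # elementwise mean across models
realProb, fakeProb = avgProbs[0], avgProbs[1]

confidence = max(fakeProb, realProb)
prediction = "fake" if fakeProb > 0.5 else "real"
if confidence < 0.75:  # same uncertainty cutoff as predict_review above
    prediction = "uncertain"

print(prediction, round(float(confidence), 3))  # fake 0.75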
app.py ADDED
@@ -0,0 +1,159 @@
+ import gradio as gr
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ import pickle
+ import re
+ import os
+ from nltk.tokenize.toktok import ToktokTokenizer
+
+ class CoolLSTMClassifier(nn.Module):
+     def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
+         super(CoolLSTMClassifier, self).__init__()
+
+         self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
+         self.embedding_dropout = nn.Dropout(0.3)
+         self.dimHidden = dimHidden
+
+         self.lstm = nn.LSTM(
+             embeddingDim,
+             dimHidden,
+             layerAmt,
+             batch_first=True,
+             bidirectional=True,
+             dropout=dropout if layerAmt > 1 else 0
+         )
+
+         self.dropout = nn.Dropout(dropout)
+         self.fc = nn.Linear(dimHidden * 2, num_classes)
+
+     def forward(self, x):
+         embedded = self.embedding(x)
+         embedded = self.embedding_dropout(embedded)
+         lstm_out, (hidden, cell) = self.lstm(embedded)
+         forward_hidden = hidden[-2, :, :]
+         backward_hidden = hidden[-1, :, :]
+         combined = torch.cat([forward_hidden, backward_hidden], dim=1)
+         combined = self.dropout(combined)
+         output = self.fc(combined)
+         return output
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ tokenizer = ToktokTokenizer()
+
+ vocab = None
+ models = None
+ embeddingMatrix = None
+
+ def load_resources():
+     global vocab, models, embeddingMatrix
+
+     if vocab is not None and models is not None:
+         return
+
+     print("loading vocab and models...")
+
+     with open('data/processed/vocab.pkl', 'rb') as f:
+         vocab = pickle.load(f)
+
+     embeddingMatrix = np.load('data/processed/embedding_matrix.npy')
+
+     vocabSize = len(vocab)
+     embeddingDim = 300
+     dimHidden = 96
+     layerAmt = 1
+     num_classes = 2
+     dropout = 0.5
+
+     models = []
+     for i in range(1, 6):
+         model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
+         model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
+         model.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
+         model.embedding.weight.requires_grad = False
+         model = model.to(device)
+         model.eval()
+         models.append(model)
+
+     print("models loaded")
+
+ def cleanText(text):
+     if not text:
+         return ""
+     text = str(text)
+     text = re.sub(r'<[^>]+>', '', text)
+     text = ' '.join(text.split())
+     return text
+
+ def cleanTokenize(text):
+     text = str(text).lower()
+     text = re.sub(r'[^a-z0-9\s]', '', text)
+     tokens = tokenizer.tokenize(text)
+     return tokens
+
+ def predict_review(text):
+     load_resources()
+
+     cleaned = cleanText(text)
+     tokens = cleanTokenize(cleaned)
+
+     if len(tokens) == 0:
+         return "invalid input", 0.0, "n/a"
+
+     indices = [vocab.get(token, vocab['<UNK>']) for token in tokens]
+
+     maxLen = 256
+     if len(indices) > maxLen:
+         indices = indices[:maxLen]
+     else:
+         indices = indices + [vocab['<PAD>']] * (maxLen - len(indices))
+
+     inpTensor = torch.LongTensor([indices]).to(device)
+
+     allOutputs = []
+     with torch.no_grad():
+         for model in models:
+             outputs = model(inpTensor)
+             probs = torch.softmax(outputs, dim=1)
+             allOutputs.append(probs.cpu().numpy())
+
+     avgProbs = np.mean(allOutputs, axis=0)[0]
+     fakeProb = avgProbs[1]
+     realProb = avgProbs[0]
+
+     confidence = max(fakeProb, realProb)
+
+     fakeThreshold = 0.75
+     realThreshold = 0.75
+
+     if fakeProb >= fakeThreshold:
+         prediction = "fake"
+     elif realProb >= realThreshold:
+         prediction = "real"
+     else:
+         prediction = "uncertain"
+
+     return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
+
+ demo = gr.Interface(
+     fn=predict_review,
+     inputs=gr.Textbox(
+         lines=5,
+         placeholder="paste review text here",
+         label="review text"
+     ),
+     outputs=[
+         gr.Textbox(label="prediction"),
+         gr.Number(label="confidence"),
+         gr.Textbox(label="probabilities")
+     ],
+     title="sentinelcheck",
+     description="fake review detector using ensemble lstm models (75% threshold)",
+     examples=[
+         ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
+         ["decent quality for the price. took about a week to arrive. works as expected."]
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
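Once this is deployed as a Space, the interface above can also be called programmatically via gradio_client; a sketch where the Space id is a placeholder:

from gradio_client import Client

# hypothetical Space id; substitute wherever this repo actually deploys
client = Client("some-user/sentinelcheck-api")

# gr.Interface exposes its fn under /predict by default
prediction, confidence, probabilities = client.predict(
    "decent quality for the price. works as expected.",
    api_name="/predict",
)
print(prediction, confidence, probabilities)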
data/processed/embedding_matrix.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:891538e491fe64bd02d633a5a3dc47e2944224562a328a58feca3b18e3781740
+ size 42703328
data/processed/vocab.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a164af08da72faefa8b54b039ec55770295da074de819d9b6b02a9fca1798b18
+ size 225374
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ flask
+ flask-cors
+ numpy
+ pandas
+ scikit-learn
+ tensorflow
+ keras
+ nltk
+ gunicorn
+ torch
+ huggingface_hub
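requirements.txt pulls in huggingface_hub while models/ is gitignored above, which fits the commit message's "external models": the .pth weights are presumably fetched at runtime rather than committed. A sketch of what that download step could look like; the repo id is a placeholder, since the actual source is not shown in this commit:

import os
from huggingface_hub import hf_hub_download

MODEL_REPO = "some-user/sentinelcheck-models"  # hypothetical model repo

os.makedirs("models", exist_ok=True)
for i in range(1, 6):
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=f"ensemble_model_{i}.pth",  # matches the paths loaded above
        local_dir="models",
    )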