Commit 02c45ef · 0 parent(s) · main

fresh deploy with external models
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ models/
FETCH_HEAD ADDED
File without changes
README.md ADDED
@@ -0,0 +1,24 @@
+ ---
+ title: sentinelcheck-api
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: "5.9.1"
+ app_file: app.py
+ pinned: false
+ ---
+
+ # sentinelcheck - fake review detector
+
+ uses an ensemble of 5 bidirectional lstm models with glove embeddings to detect fake product reviews
+
+ ## how it works
+ - paste a review into the text box
+ - the model analyzes the text
+ - get a prediction (fake/real), a confidence score, and class probabilities
+
+ ## tech stack
+ - pytorch lstm models
+ - glove 300d embeddings
+ - gradio interface
api/.DS_Store ADDED
Binary file (6.15 kB).
api/__init__.py ADDED
File without changes
api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes).
api/__pycache__/predict.cpython-313.pyc ADDED
Binary file (7.21 kB).
api/app.py ADDED
@@ -0,0 +1,45 @@
+ from flask import Flask, request, jsonify
+ from flask_cors import CORS
+ import os
+ import sys
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ from api.predict import predict_review
+
+ app = Flask(__name__)
+ CORS(app)
+
+ @app.route('/health', methods=['GET'])
+ def health():
+     return jsonify({"status": "ok"}), 200
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     try:
+         data = request.get_json()
+
+         if not data or 'text' not in data:
+             return jsonify({"error": "missing 'text' field"}), 400
+
+         reviewText = data['text']
+
+         if not isinstance(reviewText, str):
+             return jsonify({"error": "'text' must be a string"}), 400
+
+         if len(reviewText.strip()) == 0:
+             return jsonify({"error": "text cannot be empty"}), 400
+
+         result = predict_review(reviewText)
+
+         return jsonify({
+             "prediction": result['prediction'],
+             "confidence": result['confidence'],
+             "is_fake": result['is_fake']
+         }), 200
+
+     except Exception as e:
+         return jsonify({"error": str(e)}), 500
+
+ if __name__ == '__main__':
+     print("starting api server")
+     app.run(host='0.0.0.0', port=5000, debug=False)
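A minimal sketch of exercising this endpoint, assuming api/app.py is running locally on port 5000 as configured above; the sample review text is made up:

import requests

BASE_URL = "http://localhost:5000"  # assumes a local run of api/app.py

# health check
print(requests.get(f"{BASE_URL}/health").json())  # {'status': 'ok'}

# prediction: the endpoint expects a JSON body with a 'text' field
resp = requests.post(
    f"{BASE_URL}/predict",
    json={"text": "this product is absolutely amazing! five stars!"},
)
print(resp.status_code)  # 200 on success, 400 on bad input, 500 on failure
print(resp.json())       # keys: prediction, confidence, is_fake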
api/predict.py ADDED
@@ -0,0 +1,115 @@
+ import torch
+ import numpy as np
+ import re
+ import os
+ from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
+
+ scriptDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ modelsDir = os.path.join(scriptDir, "models")
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ tokenizer = None
+ models = None
+
+ def load_resources():
+     global tokenizer, models
+
+     if tokenizer is not None and models is not None:
+         return
+
+     print("loading models...")
+
+     tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+     num_classes = 2
+     dropout = 0.4
+
+     models = []
+     for i in range(1, 6):
+         model = DistilBertForSequenceClassification.from_pretrained(
+             'distilbert-base-uncased',
+             num_labels=num_classes,
+             dropout=dropout
+         )
+         model.load_state_dict(torch.load(os.path.join(modelsDir, f"ensemble_model_{i}.pth"), map_location=device))
+         model = model.to(device)
+         model.eval()
+         models.append(model)
+
+     print("models loaded")
+
+ def cleanText(text):
+     if not text:
+         return ""
+     text = str(text)
+     text = re.sub(r'<[^>]+>', '', text)
+     text = ' '.join(text.split())
+     text = text.lower()
+     text = text.strip()
+     return text
+
+ def getLengthCategory(text):
+     words = text.split()
+     wordCount = len(words)
+     if wordCount <= 20:
+         return 'short'
+     elif wordCount <= 50:
+         return 'short-medium'
+     elif wordCount <= 100:
+         return 'medium'
+     elif wordCount <= 200:
+         return 'long'
+     else:
+         return 'very-long'
+
+ def predict_review(text):
+     load_resources()
+
+     cleaned = cleanText(text)
+
+     if not cleaned:
+         return {
+             "prediction": "invalid",
+             "confidence": 0.0,
+             "is_fake": False,
+             "error": "empty text after preprocessing"
+         }
+
+     encoding = tokenizer(
+         cleaned,
+         truncation=True,
+         padding='max_length',
+         max_length=256,
+         return_tensors='pt'
+     )
+
+     input_ids = encoding['input_ids'].to(device)
+     attention_mask = encoding['attention_mask'].to(device)
+
+     allOutputs = []
+     with torch.no_grad():
+         for model in models:
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+             probs = torch.softmax(outputs.logits, dim=1)
+             allOutputs.append(probs.cpu().numpy())
+
+     avgProbs = np.mean(allOutputs, axis=0)[0]
+     fakeProb = avgProbs[1]
+     realProb = avgProbs[0]
+
+     isFake = fakeProb > 0.5
+     confidence = max(fakeProb, realProb)
+     prediction = "fake" if isFake else "real"
+
+     if confidence < 0.75:
+         prediction = "uncertain"
+
+     lengthCat = getLengthCategory(cleaned)
+
+     return {
+         "prediction": prediction,
+         "confidence": float(confidence),
+         "is_fake": bool(isFake),
+         "length_category": lengthCat,
+         "token_count": len(cleaned.split())
+     }
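The ensemble step above reduces to averaging each model's softmax output and thresholding the result; a self-contained sketch with made-up per-model probabilities:

import numpy as np

# hypothetical softmax outputs from the 5 models for one review: [P(real), P(fake)]
allOutputs = [
    np.array([[0.20, 0.80]]),
    np.array([[0.30, 0.70]]),
    np.array([[0.25, 0.75]]),
    np.array([[0.15, 0.85]]),
    np.array([[0.35, 0.65]]),
]

avgProbs = np.mean(allOutputs, axis=0)[0]  # elementwise mean across models
realProb, fakeProb = avgProbs[0], avgProbs[1]

confidence = max(fakeProb, realProb)
prediction = "fake" if fakeProb > 0.5 else "real"
if confidence < 0.75:  # same uncertainty cutoff as predict_review above
    prediction = "uncertain"

print(prediction, round(float(confidence), 3))  # fake 0.75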
app.py ADDED
@@ -0,0 +1,159 @@
+ import gradio as gr
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ import pickle
+ import re
+ import os
+ from nltk.tokenize.toktok import ToktokTokenizer
+
+ class CoolLSTMClassifier(nn.Module):
+     def __init__(self, vocabSize, embeddingDim, dimHidden, layerAmt, num_classes=2, dropout=0.3):
+         super(CoolLSTMClassifier, self).__init__()
+
+         self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx=0)
+         self.embedding_dropout = nn.Dropout(0.3)
+         self.dimHidden = dimHidden
+
+         self.lstm = nn.LSTM(
+             embeddingDim,
+             dimHidden,
+             layerAmt,
+             batch_first=True,
+             bidirectional=True,
+             dropout=dropout if layerAmt > 1 else 0
+         )
+
+         self.dropout = nn.Dropout(dropout)
+         self.fc = nn.Linear(dimHidden * 2, num_classes)
+
+     def forward(self, x):
+         embedded = self.embedding(x)
+         embedded = self.embedding_dropout(embedded)
+         lstm_out, (hidden, cell) = self.lstm(embedded)
+         forward_hidden = hidden[-2, :, :]
+         backward_hidden = hidden[-1, :, :]
+         combined = torch.cat([forward_hidden, backward_hidden], dim=1)
+         combined = self.dropout(combined)
+         output = self.fc(combined)
+         return output
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ tokenizer = ToktokTokenizer()
+
+ vocab = None
+ models = None
+ embeddingMatrix = None
+
+ def load_resources():
+     global vocab, models, embeddingMatrix
+
+     if vocab is not None and models is not None:
+         return
+
+     print("loading vocab and models...")
+
+     with open('data/processed/vocab.pkl', 'rb') as f:
+         vocab = pickle.load(f)
+
+     embeddingMatrix = np.load('data/processed/embedding_matrix.npy')
+
+     vocabSize = len(vocab)
+     embeddingDim = 300
+     dimHidden = 96
+     layerAmt = 1
+     num_classes = 2
+     dropout = 0.5
+
+     models = []
+     for i in range(1, 6):
+         model = CoolLSTMClassifier(vocabSize, embeddingDim, dimHidden, layerAmt, num_classes, dropout)
+         model.load_state_dict(torch.load(f'models/ensemble_model_{i}.pth', map_location=device))
+         model.embedding.weight.data.copy_(torch.from_numpy(embeddingMatrix))
+         model.embedding.weight.requires_grad = False
+         model = model.to(device)
+         model.eval()
+         models.append(model)
+
+     print("models loaded")
+
+ def cleanText(text):
+     if not text:
+         return ""
+     text = str(text)
+     text = re.sub(r'<[^>]+>', '', text)
+     text = ' '.join(text.split())
+     return text
+
+ def cleanTokenize(text):
+     text = str(text).lower()
+     text = re.sub(r'[^a-z0-9\s]', '', text)
+     tokens = tokenizer.tokenize(text)
+     return tokens
+
+ def predict_review(text):
+     load_resources()
+
+     cleaned = cleanText(text)
+     tokens = cleanTokenize(cleaned)
+
+     if len(tokens) == 0:
+         return "invalid input", 0.0, "n/a"
+
+     indices = [vocab.get(token, vocab['<UNK>']) for token in tokens]
+
+     maxLen = 256
+     if len(indices) > maxLen:
+         indices = indices[:maxLen]
+     else:
+         indices = indices + [vocab['<PAD>']] * (maxLen - len(indices))
+
+     inpTensor = torch.LongTensor([indices]).to(device)
+
+     allOutputs = []
+     with torch.no_grad():
+         for model in models:
+             outputs = model(inpTensor)
+             probs = torch.softmax(outputs, dim=1)
+             allOutputs.append(probs.cpu().numpy())
+
+     avgProbs = np.mean(allOutputs, axis=0)[0]
+     fakeProb = avgProbs[1]
+     realProb = avgProbs[0]
+
+     confidence = max(fakeProb, realProb)
+
+     fakeThreshold = 0.75
+     realThreshold = 0.75
+
+     if fakeProb >= fakeThreshold:
+         prediction = "fake"
+     elif realProb >= realThreshold:
+         prediction = "real"
+     else:
+         prediction = "uncertain"
+
+     return prediction, float(confidence), f"fake: {fakeProb:.3f}, real: {realProb:.3f}"
+
+ demo = gr.Interface(
+     fn=predict_review,
+     inputs=gr.Textbox(
+         lines=5,
+         placeholder="paste review text here",
+         label="review text"
+     ),
+     outputs=[
+         gr.Textbox(label="prediction"),
+         gr.Number(label="confidence"),
+         gr.Textbox(label="probabilities")
+     ],
+     title="sentinelcheck",
+     description="fake review detector using ensemble lstm models (75% threshold)",
+     examples=[
+         ["this product is absolutely amazing! i received it for free and it changed my life completely. five stars!"],
+         ["decent quality for the price. took about a week to arrive. works as expected."]
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
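Once this is deployed as a Space, the interface above can also be called programmatically via gradio_client; a sketch where the Space id is a placeholder:

from gradio_client import Client

# hypothetical Space id; substitute wherever this repo actually deploys
client = Client("some-user/sentinelcheck-api")

# gr.Interface exposes its fn under /predict by default
prediction, confidence, probabilities = client.predict(
    "decent quality for the price. works as expected.",
    api_name="/predict",
)
print(prediction, confidence, probabilities)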
data/processed/embedding_matrix.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:891538e491fe64bd02d633a5a3dc47e2944224562a328a58feca3b18e3781740
+ size 42703328
data/processed/vocab.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a164af08da72faefa8b54b039ec55770295da074de819d9b6b02a9fca1798b18
+ size 225374
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ flask
+ flask-cors
+ numpy
+ pandas
+ scikit-learn
+ tensorflow
+ keras
+ nltk
+ gunicorn
+ torch
+ huggingface_hub
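requirements.txt pulls in huggingface_hub while models/ is gitignored above, which fits the commit message's "external models": the .pth weights are presumably fetched at runtime rather than committed. A sketch of what that download step could look like; the repo id is a placeholder, since the actual source is not shown in this commit:

import os
from huggingface_hub import hf_hub_download

MODEL_REPO = "some-user/sentinelcheck-models"  # hypothetical model repo

os.makedirs("models", exist_ok=True)
for i in range(1, 6):
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=f"ensemble_model_{i}.pth",  # matches the paths loaded above
        local_dir="models",
    )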