"""Fake-review detection via a 5-model DistilBERT ensemble.

Loads five fine-tuned DistilBERT sequence classifiers lazily on first
prediction, averages their softmax probabilities, and labels a review
"fake" / "real" / "uncertain" (below a 0.75 confidence threshold).
"""

import os
import re

import numpy as np
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

# Project layout: this file lives one level below the repo root; model
# checkpoints are expected in <repo_root>/models/ensemble_model_{1..5}.pth.
scriptDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
modelsDir = os.path.join(scriptDir, "models")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Lazily populated by load_resources(); None until first prediction.
tokenizer = None
models = None

# Compiled once at import time — cleanText() runs this on every call.
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def load_resources():
    """Load the tokenizer and the five ensemble models (idempotent).

    Populates the module-level ``tokenizer`` and ``models`` globals on
    first call; subsequent calls are no-ops.
    """
    global tokenizer, models
    if tokenizer is not None and models is not None:
        return

    print("loading models...")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    num_classes = 2  # index 0 = real, index 1 = fake (see predict_review)
    dropout = 0.4
    models = []
    for i in range(1, 6):
        model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_classes,
            dropout=dropout
        )
        # NOTE(review): torch.load without weights_only=True executes
        # arbitrary pickled code — fine for trusted checkpoints, but
        # consider weights_only=True if the files could be untrusted.
        model.load_state_dict(
            torch.load(os.path.join(modelsDir, f"ensemble_model_{i}.pth"),
                       map_location=device)
        )
        model = model.to(device)
        model.eval()  # disable dropout for deterministic inference
        models.append(model)
    print("models loaded")


def cleanText(text):
    """Normalize raw review text: strip HTML tags, collapse whitespace, lowercase.

    Returns "" for falsy input. Non-string input is coerced via str().
    """
    if not text:
        return ""
    text = str(text)
    text = _HTML_TAG_RE.sub('', text)
    # split()/join collapses all runs of whitespace AND trims the ends,
    # so no separate strip() is needed.
    text = ' '.join(text.split())
    text = text.lower()
    return text


def getLengthCategory(text):
    """Bucket a text by whitespace word count into a coarse length label."""
    wordCount = len(text.split())
    if wordCount <= 20:
        return 'short'
    elif wordCount <= 50:
        return 'short-medium'
    elif wordCount <= 100:
        return 'medium'
    elif wordCount <= 200:
        return 'long'
    else:
        return 'very-long'


def predict_review(text):
    """Classify a review as fake/real using the averaged ensemble output.

    Args:
        text: Raw review text (any type; coerced and cleaned first).

    Returns:
        dict with keys:
            prediction: "fake", "real", "uncertain" (confidence < 0.75),
                        or "invalid" (empty after cleaning).
            confidence: float, max of the two averaged class probabilities.
            is_fake:    bool, True when averaged fake probability > 0.5
                        (set even when prediction is "uncertain").
            length_category / token_count: present except for "invalid".
            error:      present only for "invalid".
    """
    load_resources()

    cleaned = cleanText(text)
    if not cleaned:
        return {
            "prediction": "invalid",
            "confidence": 0.0,
            "is_fake": False,
            "error": "empty text after preprocessing"
        }

    encoding = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Collect per-model softmax distributions, then average across the ensemble.
    allOutputs = []
    with torch.no_grad():
        for model in models:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)
            allOutputs.append(probs.cpu().numpy())

    avgProbs = np.mean(allOutputs, axis=0)[0]
    fakeProb = avgProbs[1]  # class 1 = fake (see load_resources)
    realProb = avgProbs[0]

    isFake = fakeProb > 0.5
    confidence = max(fakeProb, realProb)
    prediction = "fake" if isFake else "real"
    if confidence < 0.75:
        # Low ensemble agreement — downgrade the label but keep is_fake
        # so callers can still see which side of 0.5 it fell on.
        prediction = "uncertain"

    lengthCat = getLengthCategory(cleaned)

    return {
        "prediction": prediction,
        "confidence": float(confidence),
        "is_fake": bool(isFake),
        "length_category": lengthCat,
        # NOTE(review): this is the whitespace word count of the cleaned
        # text, not the tokenizer's token count — key name kept for
        # backward compatibility with existing callers.
        "token_count": len(cleaned.split())
    }