Spaces:

codingcoolfun9ed
/

sentinelcheck-api

Sleeping

App Files Files Community

codingcoolfun9ed committed on Jan 18

Commit

d31cc65

verified ·

1 Parent(s): 8eec530

updating this for the new ensemble for the final version AHHHH

Browse files

Files changed (1) hide show

api/predict.py +231 -61

api/predict.py CHANGED Viewed

@@ -1,51 +1,201 @@
 import torch
 import numpy as np
 import re
-from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
 from huggingface_hub import hf_hub_download
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-tokenizer = None
-model = None
-def load_resources():
-    global tokenizer, model
-    if tokenizer is not None and model is not None:
         return
-    print("loading model...")
-    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
-    print("downloading model_2.pth...")
-    modelPath = hf_hub_download(
-        repo_id="codingcoolfun9ed/sentinelcheck-models",
-        filename="model_2.pth"
-    )
-    model = DistilBertForSequenceClassification.from_pretrained(
-        'distilbert-base-uncased',
-        num_labels=2,
-        dropout=0.4
-    )
-    model.load_state_dict(torch.load(modelPath, map_location=device))
-    model = model.to(device)
-    model.eval()
-    print("model loaded")
-def cleanText(text):
-    if not text:
-        return ""
-    text = str(text)
-    text = re.sub(r'<[^>]+>', '', text)
-    text = ' '.join(text.split())
-    text = text.lower()
-    text = text.strip()
-    return text
 def getLengthCategory(text):
     words = text.split()
     wordCount = len(words)
     if wordCount <= 20:
@@ -60,53 +210,73 @@ def getLengthCategory(text):
         return 'very-long'
 def predict_review(text):
-    load_resources()
-    cleaned = cleanText(text)
-    if not cleaned:
         return {
-            "prediction": "invalid",
             "confidence": 0.0,
-            "is_fake": False,
             "error": "empty text after preprocessing"
         }
-    encoding = tokenizer(
-        cleaned,
-        truncation=True,
-        padding='max_length',
-        max_length=256,
-        return_tensors='pt'
-    )
-    inputIds = encoding['input_ids'].to(device)
-    attentionMask = encoding['attention_mask'].to(device)
-    with torch.no_grad():
-        outputs = model(input_ids=inputIds, attention_mask=attentionMask)
-        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
-    fakeProb = probs[1]
-    realProb = probs[0]
-    confidence = max(fakeProb, realProb)
-    if confidence < 0.75:
         prediction = "uncertain"
-        isFake = None
     else:
-        isFake = fakeProb > realProb
-        prediction = "fake" if isFake else "real"
     lengthCat = getLengthCategory(cleaned)
     return {
         "prediction": prediction,
         "confidence": float(confidence),
-        "is_fake": isFake,
-        "length_category": lengthCat,
-        "token_count": len(cleaned.split()),
         "fake_probability": float(fakeProb),
-        "real_probability": float(realProb)
     }

 import torch
 import numpy as np
 import re
+from transformers import (
+    DistilBertTokenizer, DistilBertForSequenceClassification,
+    RobertaTokenizer, RobertaForSequenceClassification,
+    BertTokenizer, BertForSequenceClassification
+)
 from huggingface_hub import hf_hub_download
+import gc
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Parallel registries filled by loadResources(): index i of each list
# describes ensemble member i (model, its tokenizer, its max token length).
models = []
tokenizers = []
maxLengths = []
# Equal-weight soft voting. Exact thirds are used (rather than 0.333) so the
# weights sum to 1 and the blended class probabilities stay normalised.
modelWeights = [1.0 / 3.0] * 3
# A review is flagged as fake when the blended fake probability exceeds this.
optimalThreshold = 0.45
# predict_review() reports "uncertain" when model agreement is below this.
# NOTE(review): with three models agreement moves in thirds, so 2/3 (0.666...)
# is below 0.67 and only a unanimous vote clears it -- confirm this is intended.
uncertaintyThreshold = 0.67
# Class index -> label; index 0 is genuine and index 1 is fake.
CLASS_NAMES = ['genuine', 'fake']
def validateText(text):
    """Report whether ``text`` is a string containing at least one word.

    Returns ``False`` for any value that is not a ``str`` and for strings
    that are empty or consist only of whitespace; ``True`` otherwise.
    """
    if isinstance(text, str):
        # split() with no separator ignores leading/trailing whitespace and
        # yields an empty list for blank input, so one check covers both.
        return bool(text.split())
    return False
def cleanReview(text):
    """Normalise raw review text before it is tokenised.

    Steps, in order: drop HTML-like tags, drop URLs, collapse runs of a
    repeated ``!``, ``?`` or ``.`` to a single character, and squeeze all
    whitespace to single spaces.

    Args:
        text: The raw review. Anything falsy or not a ``str`` is rejected.

    Returns:
        The cleaned string, or ``""`` when the input is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return ""
    # Tags must go BEFORE URLs. The URL pattern consumes every non-space
    # character after "http", so on '<a href="http://x">link</a>' it would
    # swallow the closing '">link</a>' and leave a dangling '<a href="' that
    # the tag pattern can no longer match.
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'([!?.])\1+', r'\1', text)
    # split()/join both collapses inner whitespace and trims the ends.
    return ' '.join(text.split())
def loadResources():
    """Load the three ensemble members once per process.

    For each configured member this downloads its fine-tuned checkpoint from
    the ``codingcoolfun9ed/sentinelcheck-models`` hub repo, instantiates the
    matching base tokenizer and 2-label classification model, loads the
    checkpoint's ``state_dict`` into it, moves it to ``device`` and puts it in
    eval mode with gradients disabled.

    The module-level ``models``, ``tokenizers`` and ``maxLengths`` lists are
    filled only after EVERY member has loaded. A failure part-way through
    therefore leaves them empty, so the next call retries instead of silently
    serving a truncated ensemble.

    Raises:
        ValueError: On an unknown model type or a checkpoint without a
            ``state_dict`` entry.
        Exception: Any download or load error is logged and re-raised.
    """
    global models, tokenizers, maxLengths
    if models:
        return
    print("loading ensemble models...", flush=True)
    modelConfigs = [
        {
            'filename': 'ensemble_model_1.pth',
            'type': 'distilbert',
            'name': 'distilbert-base-uncased',
            'maxLen': 128
        },
        {
            'filename': 'ensemble_model_2.pth',
            'type': 'roberta',
            'name': 'roberta-base',
            'maxLen': 192
        },
        {
            'filename': 'ensemble_model_3.pth',
            'type': 'bert',
            'name': 'bert-base-uncased',
            'maxLen': 256
        }
    ]
    # Architecture tag -> (tokenizer class, model class).
    modelClasses = {
        'distilbert': (DistilBertTokenizer, DistilBertForSequenceClassification),
        'roberta': (RobertaTokenizer, RobertaForSequenceClassification),
        'bert': (BertTokenizer, BertForSequenceClassification),
    }
    # Stage everything locally; the globals are touched only on full success.
    loadedModels = []
    loadedTokenizers = []
    loadedMaxLengths = []
    for i, config in enumerate(modelConfigs, 1):
        try:
            print(f"loading model {i}: {config['type']}", flush=True)
            modelPath = hf_hub_download(
                repo_id="codingcoolfun9ed/sentinelcheck-models",
                filename=config['filename']
            )
            if config['type'] not in modelClasses:
                raise ValueError(f"unknown model type: {config['type']}")
            tokenizerClass, modelClass = modelClasses[config['type']]
            tokenizer = tokenizerClass.from_pretrained(config['name'])
            model = modelClass.from_pretrained(config['name'], num_labels=2)
            # SECURITY NOTE(review): weights_only=False unpickles arbitrary
            # objects from the downloaded file. This is only acceptable while
            # the checkpoint repo is fully trusted; switch to
            # weights_only=True if the checkpoints hold nothing but tensors.
            checkpoint = torch.load(modelPath, map_location=device, weights_only=False)
            if not isinstance(checkpoint, dict) or 'state_dict' not in checkpoint:
                raise ValueError(f"model {i} missing state_dict")
            # strict=False tolerates key drift between library versions, but
            # mismatches are reported so a head left at its random
            # initialisation does not go unnoticed.
            loadReport = model.load_state_dict(checkpoint['state_dict'], strict=False)
            if loadReport.missing_keys:
                print(f"warning: model {i} missing keys: {loadReport.missing_keys}", flush=True)
            if loadReport.unexpected_keys:
                print(f"warning: model {i} unexpected keys: {loadReport.unexpected_keys}", flush=True)
            model = model.to(device)
            model.eval()
            model.requires_grad_(False)
            loadedModels.append(model)
            loadedTokenizers.append(tokenizer)
            loadedMaxLengths.append(config['maxLen'])
            # The raw checkpoint is large; release it before the next download.
            del checkpoint
            gc.collect()
            print(f"model {i} loaded successfully", flush=True)
        except Exception as e:
            print(f"error loading model {i}: {str(e)}", flush=True)
            raise
    # Extend in place so any module holding a reference to these lists
    # sees the loaded ensemble.
    models.extend(loadedModels)
    tokenizers.extend(loadedTokenizers)
    maxLengths.extend(loadedMaxLengths)
    print("all ensemble models loaded", flush=True)
def ensemblePredict(text):
    """Score one review with the weighted soft-voting ensemble.

    The text is coerced to ``str``, cleaned with ``cleanReview`` and checked
    with ``validateText``. Each loaded model then produces softmax class
    probabilities, which are blended using ``modelWeights``. The blend is
    normalised by the weights actually applied, so the two probabilities sum
    to 1 even when the weights do not.

    Args:
        text: The review text; non-strings are converted with ``str()``.

    Returns:
        A dict with ``genuineProb``, ``fakeProb``, ``isFake`` (fake
        probability above ``optimalThreshold``) and ``agreement`` (fraction of
        models whose own argmax matches the ensemble decision). On invalid
        text or an inference failure the dict holds 0.5/0.5 probabilities,
        ``isFake=None``, ``agreement=0.0`` and an ``error`` message.

    Raises:
        Whatever ``loadResources()`` raises; model loading happens outside the
        best-effort error handling.
    """
    loadResources()
    if not isinstance(text, str):
        text = str(text)
    text = cleanReview(text)
    if not validateText(text):
        return {
            'fakeProb': 0.5,
            'genuineProb': 0.5,
            'isFake': None,
            'agreement': 0.0,
            'error': 'invalid_text'
        }
    weightedProbs = torch.zeros(1, 2, device=device)
    totalWeight = 0.0
    allPreds = []
    try:
        with torch.no_grad():
            for tokenizer, model, maxLen, weight in zip(tokenizers, models, maxLengths, modelWeights):
                inputs = tokenizer(
                    text,
                    return_tensors='pt',
                    truncation=True,
                    max_length=maxLen,
                    padding='max_length'
                )
                outputs = model(
                    input_ids=inputs['input_ids'].to(device),
                    attention_mask=inputs['attention_mask'].to(device)
                )
                probs = torch.softmax(outputs.logits, dim=1)
                weightedProbs += probs * weight
                totalWeight += weight
                allPreds.append(int(torch.argmax(probs, dim=1).item()))
        if not allPreds or totalWeight <= 0:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise RuntimeError("no ensemble models available for prediction")
        blended = (weightedProbs[0] / totalWeight).cpu().numpy()
        genuineProb = float(blended[0])
        fakeProb = float(blended[1])
        isFake = bool(fakeProb > optimalThreshold)
        finalPred = 1 if isFake else 0
        # NOTE(review): individual votes use argmax (an implicit 0.5 cut-off)
        # while the ensemble decision uses optimalThreshold, so a model can
        # "disagree" with a decision its own score supports -- confirm intent.
        agreement = sum(1 for p in allPreds if p == finalPred) / len(allPreds)
        return {
            'genuineProb': genuineProb,
            'fakeProb': fakeProb,
            'isFake': isFake,
            'agreement': agreement
        }
    except Exception as e:
        # Deliberate best-effort: report the failure to the caller, not crash.
        print(f"prediction error: {str(e)}", flush=True)
        return {
            'fakeProb': 0.5,
            'genuineProb': 0.5,
            'isFake': None,
            'agreement': 0.0,
            'error': str(e)
        }
 def getLengthCategory(text):
+    if not text:
+        return 'empty'
     words = text.split()
     wordCount = len(words)
     if wordCount <= 20:
         return 'very-long'
 def predict_review(text):
+    if not text or not isinstance(text, str):
+        return {
+            "prediction": "error",
+            "confidence": 0.0,
+            "is_fake": None,
+            "model_agreement": 0.0,
+            "fake_probability": 0.0,
+            "genuine_probability": 0.0,
+            "length_category": "empty",
+            "token_count": 0,
+            "error": "invalid input: text must be non-empty string"
+        }
+    cleaned = cleanReview(text)
+    if not cleaned or len(cleaned.strip()) == 0:
         return {
+            "prediction": "error",
             "confidence": 0.0,
+            "is_fake": None,
+            "model_agreement": 0.0,
+            "fake_probability": 0.0,
+            "genuine_probability": 0.0,
+            "length_category": "empty",
+            "token_count": 0,
             "error": "empty text after preprocessing"
         }
+    result = ensemblePredict(text)
+    if 'error' in result:
+        return {
+            "prediction": "error",
+            "confidence": 0.0,
+            "is_fake": None,
+            "model_agreement": result['agreement'],
+            "fake_probability": result['fakeProb'],
+            "genuine_probability": result['genuineProb'],
+            "length_category": getLengthCategory(cleaned),
+            "token_count": len(cleaned.split()),
+            "error": result['error']
+        }
+    fakeProb = result['fakeProb']
+    genuineProb = result['genuineProb']
+    isFake = result['isFake']
+    agreement = result['agreement']
+    confidence = max(fakeProb, genuineProb)
+    if agreement < uncertaintyThreshold:
         prediction = "uncertain"
+        isFakeOutput = None
     else:
+        prediction = "fake" if isFake else "genuine"
+        isFakeOutput = isFake
     lengthCat = getLengthCategory(cleaned)
+    tokenCount = len(cleaned.split())
     return {
         "prediction": prediction,
         "confidence": float(confidence),
+        "is_fake": isFakeOutput,
+        "model_agreement": float(agreement),
         "fake_probability": float(fakeProb),
+        "genuine_probability": float(genuineProb),
+        "length_category": lengthCat,
+        "token_count": tokenCount
     }