innocence_matrix / app /bert_model.py
pradoth's picture
Upload folder using huggingface_hub
a6b677f verified
import torch, pathlib, nltk
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from app.preprocess import split_sentences
# Import-time side effect: fetch the NLTK 'punkt' resource without console output.
nltk.download('punkt', quiet=True)

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Checkpoint directory, resolved relative to the grandparent directory of this file.
MODEL_PATH = pathlib.Path(__file__).parent.parent.joinpath("models/distilbert_innocence")

# Tokenizer and classifier are both loaded from the same checkpoint directory.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# Load the classifier, move it to the selected device, and switch it to eval mode.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()
@torch.no_grad()
def predict_sentences(text: str, cutoff: float = 0.70, *,
                      max_length: int = 128, batch_size: int = 32):
    """Score each sentence of *text* and return those at or above *cutoff*.

    The text is split with ``split_sentences``; every sentence is tokenized
    and passed through the module-level ``model``. The reported confidence is
    the softmax probability of class index 1.

    Sentences are scored in chunks of ``batch_size`` so that the size of the
    tensors sent through the model is bounded by the chunk size rather than
    by the number of sentences in the document.

    Args:
        text: Raw input text to split into sentences.
        cutoff: Minimum class-1 probability (inclusive) for a sentence to be
            kept. The comparison uses the unrounded probability.
        max_length: Maximum number of tokens per sentence; longer sentences
            are truncated by the tokenizer.
        batch_size: Number of sentences scored per forward pass. Must be
            at least 1.

    Returns:
        A list of ``{'sentence': str, 'confidence': float}`` dicts in the
        original sentence order, with confidence rounded to 3 decimals.
        Empty when no sentences are found or none reach ``cutoff``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    sents = split_sentences(text)
    if not sents:
        return []

    probs = []
    for start in range(0, len(sents), batch_size):
        chunk = sents[start:start + batch_size]
        # padding=True pads only up to the longest sentence in this chunk.
        encoded = tokenizer(chunk, truncation=True, padding=True,
                            max_length=max_length, return_tensors='pt').to(device)
        logits = model(**encoded).logits
        # Column 1 of the softmax output is the class-1 probability.
        probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().tolist())

    return [{'sentence': s, 'confidence': round(p, 3)}
            for s, p in zip(sents, probs) if p >= cutoff]