Spaces:
Sleeping
Sleeping
| import torch, pathlib, nltk | |
| from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification | |
| from app.preprocess import split_sentences | |
# Fetch the NLTK 'punkt' resource quietly at import time
# (presumably needed by split_sentences -- confirm in app.preprocess).
nltk.download('punkt', quiet=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The fine-tuned checkpoint directory, located relative to this file:
# the parent of this file's directory, then models/distilbert_innocence.
MODEL_PATH = pathlib.Path(__file__).parent.parent.joinpath("models/distilbert_innocence")

# Load the tokenizer and classifier once at import so every call reuses them.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()  # switch to evaluation mode for inference
def predict_sentences(text: str, cutoff: float = 0.70) -> list:
    """Classify each sentence of *text* and return the confident hits.

    The text is split with ``split_sentences``, all sentences are tokenized
    as a single padded batch (truncated to 128 tokens each), and the model's
    logits are turned into probabilities with a softmax over the class axis.

    Args:
        text: Raw input text to analyse.
        cutoff: Minimum class-1 probability (inclusive) a sentence needs in
            order to be included in the result.

    Returns:
        A list of ``{'sentence': str, 'confidence': float}`` dicts, in the
        order the sentences appear, with the confidence rounded to three
        decimals. Empty when the text yields no sentences or none reach
        *cutoff*.
    """
    sents = split_sentences(text)
    if not sents:
        return []

    encoded = tokenizer(
        sents,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    # eval() alone does not switch off autograd; without no_grad() every call
    # would build and hold a gradient graph for the whole batch.
    with torch.no_grad():
        logits = model(**encoded).logits
        # Column 1 is taken as the score of interest
        # (presumably the positive label -- confirm against the training setup).
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().tolist()

    return [
        {'sentence': s, 'confidence': round(p, 3)}
        for s, p in zip(sents, probs)
        if p >= cutoff
    ]