"""Classify a batch of texts for privacy violations with a Hugging Face
sequence-classification model and print the results as a DataFrame.

Run as a script: python this_file.py
"""

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "PL-RnD/privacy-moderation-large-4bit"

# Fallback label names used when the model config only carries the generic
# LABEL_0 / LABEL_1 placeholders. Index position == class id.
DEFAULT_LABELS = ["non-violation", "violation"]


def load_model(model_name: str = MODEL_NAME):
    """Load tokenizer and model, moving the model to GPU when available.

    Returns:
        (tokenizer, model, device) ready for inference.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    # from_pretrained already puts the model in eval mode; make the intent
    # explicit so dropout is guaranteed off for deterministic inference.
    model.eval()
    return tokenizer, model, device


def classify(texts, tokenizer, model, device) -> list:
    """Return one predicted label string per input text.

    Args:
        texts: list of strings to classify (batched together).
        tokenizer / model / device: as returned by load_model().
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()

    # Prefer the label names shipped in the model's own config; fall back to
    # the hard-coded names when the config only has generic "LABEL_i" entries.
    id2label = getattr(model.config, "id2label", None) or {}

    def label_for(idx: int) -> str:
        name = id2label.get(idx, "")
        if name and not name.startswith("LABEL_"):
            return name
        return DEFAULT_LABELS[idx]

    return [label_for(i) for i in pred_ids]


def main() -> None:
    """Classify the demo texts and print a text/label DataFrame."""
    texts = [
        "Here is my credit card number: 1234-5678-9012-3456",
        "This is a regular message without sensitive information.",
        "For homeowners insurance, select deductibles from $500 to $2,500. Higher deductibles lower premiums.",
        "Solidarity: My enrollment includes my kid's braces at $4,000 total—family strained. Push for orthodontic expansions. Email blast to reps starting now.",
    ]

    tokenizer, model, device = load_model()
    predicted_labels = classify(texts, tokenizer, model, device)

    df = pd.DataFrame({"text": texts, "label": predicted_labels})
    print(df)


if __name__ == "__main__":
    main()