idanpers
/

JailBreakModel

@@ -56,41 +56,60 @@ To run the inference pipeline for classifying prompts, follow these steps:
   # Load model directly
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-tokenizer = AutoTokenizer.from_pretrained("idanpers/JailBreakModel")
 model = AutoModelForSequenceClassification.from_pretrained("idanpers/JailBreakModel")
 use:
-  # Function to classify a single prompt using the trained model in Trainer
   def classify_prompt(prompt):
-    # Error handling for empty input
-    if not isinstance(prompt, str) or prompt.strip() == "":
-        return {"error": "Invalid input. Please provide a non-empty text prompt."}
-    # Tokenize the input prompt and convert to dataset format expected by trainer.predict
-    inputs = Tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
-    dataset = Dataset.from_dict({"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
-    # Use trainer.predict to classify
-    prediction_output = model.predict(dataset)
-    # Get the softmax probabilities for confidence scores
-    probs = torch.softmax(torch.tensor(prediction_output.predictions), dim=1).cpu().numpy()
-    confidence = np.max(probs)
-    pred_label = np.argmax(probs, axis=1)[0]
-    # Map prediction to label
-    label = "PROMPT_INJECTION" if pred_label == 1 else "BENIGN"
-    return {"label": label, "confidence": confidence}
-   #Accept input from the user and classify it
 prompt = input("Enter a prompt for classification: ")
 result = classify_prompt(prompt)
-   #Check for errors before accessing the classification result
 if "error" in result:
-    print(f"Error: {result['error']}")
 else:
-    print(f"Classification Result: {result['label']}")
-    print(f"Confidence Score: {result['confidence']:.2f}")

   # Load model directly
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Trainer and TrainingArguments are used below but are not covered by the
# ``from transformers import ...`` line above, so bring them in here.
from transformers import Trainer, TrainingArguments

# The tokenizer is deliberately bound to the capitalised name ``Tokenizer``:
# classify_prompt() looks it up under exactly that name.
Tokenizer = AutoTokenizer.from_pretrained("idanpers/JailBreakModel")
model = AutoModelForSequenceClassification.from_pretrained("idanpers/JailBreakModel")

# Arguments for the Trainer that wraps the model. Only trainer.predict() is
# called in this snippet; no training run is started here.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",  # Disable W&B
    save_safetensors=False,
)

# Create Trainer instance.
# The tokenizer must be passed under the name it was actually bound to above
# (``Tokenizer``); the lowercase ``tokenizer`` is not defined in this snippet
# and raised NameError.
# NOTE(review): recent transformers releases prefer ``processing_class=`` over
# ``tokenizer=`` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=Tokenizer,
)
 # Usage:
   def classify_prompt(prompt):
       """Classify a single text prompt as ``PROMPT_INJECTION`` or ``BENIGN``.

       Relies on these names being available at module level: ``Tokenizer``
       (the loaded tokenizer), ``trainer`` (the ``Trainer`` wrapping the
       model), ``Dataset``, ``torch`` and ``np``.

       Args:
           prompt: The text to classify.

       Returns:
           ``{"label": str, "confidence": float}`` on success, where ``label``
           is ``"PROMPT_INJECTION"`` when the predicted class index is 1 and
           ``"BENIGN"`` otherwise, and ``confidence`` is the softmax
           probability of the predicted class.
           ``{"error": str}`` when ``prompt`` is not a string or is blank; no
           exception is raised for invalid input.
       """
       # Reject non-string and empty/whitespace-only input before doing any
       # tokenization or model work.
       if not isinstance(prompt, str) or not prompt.strip():
           return {"error": "Invalid input. Please provide a non-empty text prompt."}

       # Tokenize the prompt and wrap it in the dataset format that
       # trainer.predict consumes.
       inputs = Tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
       dataset = Dataset.from_dict(
           {
               "input_ids": inputs["input_ids"],
               "attention_mask": inputs["attention_mask"],
           }
       )

       # Use trainer.predict to classify.
       prediction_output = trainer.predict(dataset)

       # Turn the raw predictions into per-class probabilities.
       logits = torch.tensor(prediction_output.predictions)
       probs = torch.softmax(logits, dim=1).cpu().numpy()

       # Only one prompt was submitted, so row 0 holds its probabilities.
       # Convert NumPy scalars to built-in types so the result dict can be
       # serialised (e.g. with json.dumps) without extra handling.
       pred_label = int(np.argmax(probs, axis=1)[0])
       confidence = float(np.max(probs[0]))

       # Map prediction to label.
       label = "PROMPT_INJECTION" if pred_label == 1 else "BENIGN"
       return {"label": label, "confidence": confidence}
# Read one prompt from the user and run it through the classifier.
prompt = input("Enter a prompt for classification: ")
result = classify_prompt(prompt)

# classify_prompt reports invalid input through an "error" key instead of
# raising, so branch on that key before reading the classification fields.
if "error" not in result:
    print(f"Classification Result: {result['label']}")
    print(f"Confidence Score: {result['confidence']:.2f}")
else:
    print(f"Error: {result['error']}")