import joblib
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Load the Aegis 2.0 content-safety dataset via Hugging Face `datasets`;
# the returned object is indexed by split name further down.
ds = load_dataset("nvidia/Aegis-AI-Content-Safety-Dataset-2.0")

# Column holding the text to classify, and the column holding its label.
TEXT_COL = "response"
LABEL_COL = "response_label"

# Normalised (stripped, lower-cased) label strings that map to class 1.
SAFE_TOKENS = {"safe"}


def to_binary_label(raw: object) -> int:
    """Map a raw dataset label to a binary training target.

    Args:
        raw: Label value as read from the dataset row. May be ``None``;
            any other value is converted with ``str()`` before comparison.

    Returns:
        ``1`` if the label, stripped and lower-cased, is in ``SAFE_TOKENS``;
        ``0`` otherwise, including when ``raw`` is ``None``.
    """
    if raw is None:
        # Missing labels fall into the non-safe class.
        return 0
    return 1 if str(raw).strip().lower() in SAFE_TOKENS else 0

train = ds["train"]

# Keep only rows whose text column is a non-blank string. The isinstance
# check already rejects None and other non-string values.
records = [
    r for r in train
    if isinstance(r.get(TEXT_COL), str) and r[TEXT_COL].strip()
]

# Features are the stripped texts; targets are the binarised labels
# (rows with a missing label are mapped to 0 by to_binary_label).
X = [r[TEXT_COL].strip() for r in records]
y = [to_binary_label(r.get(LABEL_COL)) for r in records]

# Hold out 15% of the data as the final test set, stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# 0.1765 of the remaining 85% is ~15% of the full data, so the overall
# train/validation/test proportions are roughly 70/15/15.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

# Text classifier: TF-IDF features fed into a two-hidden-layer MLP.
pipe = Pipeline([
    # Unigrams and bigrams, vocabulary capped at 100k features; min_df=3
    # drops terms that appear in fewer than 3 documents.
    ("tfidf", TfidfVectorizer(max_features=100_000, ngram_range=(1, 2), min_df=3)),
    # NOTE(review): with early_stopping=True scikit-learn holds out part of
    # the training data internally for its stopping criterion, separate from
    # X_val. max_iter=10 caps training at a small number of iterations, so a
    # convergence warning is possible -- confirm this cap is intentional.
    ("clf", MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        batch_size=256,
        early_stopping=True,
        max_iter=10,
        verbose=True,
        random_state=42,
    )),
])

pipe.fit(X_train, y_train)

# Per-class precision/recall/F1 on the validation split.
print("Validation results:")
pred_val = pipe.predict(X_val)
print(classification_report(y_val, pred_val, digits=3))

# Same report on the held-out test split.
print("Test results:")
pred_test = pipe.predict(X_test)
print(classification_report(y_test, pred_test, digits=3))

# Accuracy summary. Training accuracy needs a fresh prediction pass; for
# validation and test, reuse the predictions computed above instead of
# re-running vectorisation and inference via pipe.score (a classifier's
# score() is the accuracy of its predictions, so the values are the same).
print("Train accuracy:", pipe.score(X_train, y_train))
print("Val accuracy:", accuracy_score(y_val, pred_val))
print("Test accuracy:", accuracy_score(y_test, pred_test))

# Persist the fitted pipeline (vectoriser + classifier) as one artifact.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
MODEL_PATH = "mlp_tfidf_aegis2.joblib"
joblib.dump(pipe, MODEL_PATH)
print(f"Saved to {MODEL_PATH}")