| | |
| | |
| |
|
| |
|
| | from sklearn.pipeline import Pipeline |
| |
|
| | from skops import card, hub_utils |
| |
|
| | from datasets import load_dataset |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| |
|
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.metrics import classification_report |
| | import os |
| | from skops.io import dump |
| | from pathlib import Path |
| | from tempfile import mkdtemp, mkstemp |
| | import sklearn |
| | from argparse import ArgumentParser |
| |
|
| |
|
| | |
| |
|
# Module-level defaults. The command-line entry point at the bottom of this
# file overwrites them with the parsed argument values.

# Dataset and hub repository identifiers.
data = "deepset/prompt-injections"
repo_id = "thevgergroup/prompt_protect"

# Where the trained model is written and what the file is called
# (the ".skops" extension is appended when the model is saved).
save_directory = "models"
model_name = "prompt_protect_model"

# Hub upload settings.
upload = False
commit_message = "Initial commit"

# Train/test features and labels; placeholders until split_data() fills them.
X_train = X_test = y_train = y_test = None
| |
|
| |
|
def load_data(data):
    """Load a dataset with ``datasets.load_dataset`` and return it.

    Args:
        data: Dataset identifier, passed to ``load_dataset`` unchanged.

    Returns:
        Whatever ``load_dataset(data)`` returns.
    """
    return load_dataset(data)
| |
|
| |
|
def split_data(dataset):
    """Extract train/test features and labels from ``dataset``.

    The ``"train"`` and ``"test"`` entries of ``dataset`` are converted with
    ``.to_pandas()`` and their ``"text"`` and ``"label"`` columns are selected.

    The results are stored in the module-level ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` variables (existing callers rely on this) and
    are also returned, so the function can be used without reading globals.

    Args:
        dataset: Mapping-like object with ``"train"`` and ``"test"`` entries,
            each exposing ``to_pandas()`` and yielding ``"text"`` and
            ``"label"`` columns.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If a split or a column is missing.
    """
    global X_train, X_test, y_train, y_test

    train_frame = dataset["train"].to_pandas()
    test_frame = dataset["test"].to_pandas()

    X_train, y_train = train_frame["text"], train_frame["label"]
    X_test, y_test = test_frame["text"], test_frame["label"]

    return X_train, X_test, y_train, y_test
| | |
| | |
| |
|
def train_model(X_train, y_train):
    """Fit and return a TF-IDF + logistic-regression text classifier.

    Args:
        X_train: Training texts fed to the vectorizer.
        y_train: Training labels.

    Returns:
        The fitted ``Pipeline`` with steps ``"vectorize"`` and ``"lgr"``.
    """
    steps = [
        # TF-IDF features, limited to 5000 via max_features.
        ("vectorize", TfidfVectorizer(max_features=5000)),
        # Logistic regression with scikit-learn's default settings.
        ("lgr", LogisticRegression()),
    ]
    classifier = Pipeline(steps)
    classifier.fit(X_train, y_train)
    return classifier
| |
|
def evaluate_model(model):
    """Score ``model`` on the module-level test split.

    Reads the module globals ``X_test`` and ``y_test``, which ``split_data``
    populates, so ``split_data`` must have been called first.

    Args:
        model: Fitted estimator exposing ``predict``.

    Returns:
        The result of ``classification_report(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    return report
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: train the classifier, print its evaluation
    # report, save it as a .skops file and optionally push it to the hub.
    parser = ArgumentParser()
    parser.add_argument("--data", type=str, default="deepset/prompt-injections", help="Dataset to use for training, expects a huggingface dataset with train and test splits and text / label columns")
    parser.add_argument("--save_directory", type=str, default="models/thevgergroup", help="Directory to save the model to")
    parser.add_argument("--model_name", type=str, default="prompt_protect_model", help="Name of the model file, will have .skops extension added to it")
    parser.add_argument("--repo_id", type=str, default="thevgergroup/prompt_protect", help="Repo to push the model to")
    parser.add_argument("--upload", action="store_true", help="Upload the model to the hub, must be a contributor to the repo")
    # The underscore spelling is accepted as an alias so this flag matches the
    # naming of the other options; the hyphenated form keeps working.
    parser.add_argument("--commit-message", "--commit_message", dest="commit_message", type=str, default="Initial commit", help="Commit message for the model push")

    args = parser.parse_args()

    # Every option carries a default, so the parsed namespace is always
    # complete and replaces the module-level fallback values.
    data = args.data
    save_directory = args.save_directory
    model_name = args.model_name
    repo_id = args.repo_id
    upload = args.upload
    commit_message = args.commit_message

    # Train and evaluate. split_data() publishes the splits as module globals,
    # which is where X_train / y_train / X_test are read from below.
    dataset = load_data(data)
    split_data(dataset)
    model = train_model(X_train=X_train, y_train=y_train)
    report = evaluate_model(model)
    print(report)

    # Persist the fitted pipeline in skops format.
    model_path = save_directory
    print("Saving model to", model_path)
    os.makedirs(model_path, exist_ok=True)

    model_file = os.path.join(model_path, f"{model_name}.skops")
    dump(model, file=model_file)

    if upload:
        # Build a local hub repository in a fresh temporary directory.
        # mkdtemp() does not delete it; the directory stays on disk afterwards.
        local_repo = mkdtemp(prefix="skops-")
        print("Creating local repo at", local_repo)
        hub_utils.init(
            model=model_file,
            dst=local_repo,
            requirements=[f"scikit-learn={sklearn.__version__}"],
            task="text-classification",
            data=X_test.to_list(),
        )

        # Ship this training script alongside the model.
        hub_utils.add_files(__file__, dst=local_repo, exist_ok=True)

        hub_utils.push(source=local_repo, repo_id=repo_id, commit_message=commit_message)
| | |
| | |
| |
|