| |
| """salamandra_v2.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/10zkziOTl9F2EpgIT_sbs5iqKPTxQlDNp |
| """ |
|
|
| import re |
| import random |
| import torch |
| import numpy as np |
| from collections import Counter |
| from dataclasses import dataclass |
| from sklearn.metrics import classification_report |
| from sklearn.preprocessing import MultiLabelBinarizer |
| from tqdm import tqdm |
|
|
| from datasets import load_dataset |
| from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup |
| from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training |
| from torch.utils.data import DataLoader |
| from accelerate import Accelerator |
| from huggingface_hub import login |
|
|
# (duplicate `from huggingface_hub import login` removed — already imported above)

# Authenticate with the Hugging Face Hub.
# SECURITY: never hard-code tokens in source — the original placeholder
# ("your token") was non-functional anyway; read from the environment instead.
import os

login(token=os.environ.get("HF_TOKEN"))
|
|
# --- Run configuration ---
DATASET_NAME = "alinia/disaggregated_by_language_sal_guard_train"  # HF dataset repo id
MODEL_NAME = "BSC-LT/salamandra-7b-instruct"  # base model to fine-tune with LoRA
OUTPUT_DIR = "salamandra_finetuned_output"  # local directory for adapter + tokenizer
MAX_LENGTH = 2048  # max token length per training example (left-truncated below)
|
|
| |
| |
| |
# Classification rubric prepended to every example. The output format described
# here must match the training targets produced by normalize_label(): "C0" for
# safe, otherwise "unsafe " + a comma-joined, sorted list of C-codes.
# Fixes vs. previous version:
#  - header said 'unsafe\n{codes}' (literal backslash-n) while the example and
#    the actual labels use 'unsafe C1,C2' — now consistent;
#  - stray "S6 " S-code artifact removed from the C2 category description.
RUBRIC = (
    'Classify the assistant response for policy violations. Output "C0" for safe content, or "unsafe {{codes}}" using the categories below.\n\n'
    '## Violation Categories\n'
    '**C1** Dangerous (Includes Violent Crimes, Suicide/Self-Harm, Non-Violent Crimes). \n'
    '**C2** Toxicity (Includes Hate/Discrimination, Harassment/Bullying, Profanity). \n'
    '**C3** Sexual Content (Includes Sexual Offenses, Sexually Explicit Content). \n\n'
    '## Rules\n'
    '- If the message promotes/encourages/instructs/assists → unsafe. \n'
    '- If the message is neutral/educational/contains news mention without encouragement → C0. \n'
    '- Return all applicable codes (e.g., "unsafe C1,C2"). \n'
    '\n'
)


# Full prompt template; {content} receives the "User: ...\nAssistant: ..." pair.
# Note: .format() un-escapes {{codes}} to a literal {codes} in the rubric text.
PROMPT_TMPL = RUBRIC + 'Classify the assistant response:\n"""\n{content}\n"""\n\n### Response:\n'
|
|
| |
| |
| |
# Fine-grained S-code → coarse C-category lookup. S0 (safe) and any code
# above S8 deliberately have no entry.
_S_TO_C = {
    "S1": "C1", "S2": "C1", "S3": "C1",
    "S4": "C2", "S5": "C2", "S6": "C2",
    "S7": "C3", "S8": "C3",
}


def map_s_to_c(s_code):
    """Map one annotator S-code to its C-category, or None if unmapped."""
    return _S_TO_C.get(s_code.upper().strip())


def normalize_label(label):
    """
    Converts raw labels (S1, S5, Safe, S0) into C-codes (C1, C2, C0).
    """
    # Non-string annotations (None, NaN, ...) count as safe.
    if not isinstance(label, str):
        return "C0"

    cleaned = label.strip().upper()
    if cleaned in ("S0", "SAFE", "C0"):
        return "C0"

    s_codes = re.findall(r"S\d+", cleaned)
    if not s_codes:
        # No S-codes at all: keep a bare "unsafe" tag, treat anything else as safe.
        return "unsafe" if "UNSAFE" in cleaned else "C0"

    categories = {c for c in map(map_s_to_c, s_codes) if c}
    if not categories:
        # Only unmapped codes (e.g. S0 embedded in text, S9+) — safe.
        return "C0"

    return "unsafe " + ",".join(sorted(categories))
|
|
def get_majority_vote(example):
    """
    Returns the consensus label from 3 annotators + GPT-4o.

    Missing voter fields default to "S0" (safe). Ties resolve to the
    first-seen label, per Counter.most_common's stable ordering.
    """
    voters = ("Annotator_1", "Annotator_2", "Annotator_3", "GPT_4o_LABEL_PROMPT")
    tally = Counter(normalize_label(example.get(name, "S0")) for name in voters)

    # Defensive fallback; unreachable while `voters` is non-empty.
    if not tally:
        return "C0"

    return tally.most_common(1)[0][0]
|
|
def extract_codes(text):
    """
    Parses generated text to find codes like {'C1', 'C2'}.
    """
    found = set(re.findall(r"C[0-3]", text.upper().strip()))

    # "C0" (safe) only stands alone: drop it when any violation code is present.
    if len(found) > 1:
        found.discard("C0")

    # Nothing recognized defaults to safe.
    return found or {"C0"}
|
|
def preprocess_and_tokenize(examples, tokenizer, max_length=512):
    """
    Prepares batch for training.

    Builds "prompt + answer" token sequences per row; prompt tokens are masked
    out of the loss with -100 so only the answer is supervised. Over-long
    sequences are truncated from the LEFT so the answer tokens survive.
    """
    batch = {"input_ids": [], "attention_mask": [], "labels": []}

    rows = zip(
        examples["prompt"],
        examples["response"],
        examples["Annotator_1"],
        examples["Annotator_2"],
        examples["Annotator_3"],
        examples["GPT_4o_LABEL_PROMPT"],
    )
    for prompt_text, response_text, a1, a2, a3, gpt in rows:
        content = f"User: {prompt_text}\nAssistant: {response_text}"

        # Consensus training target from the annotator columns.
        target = get_majority_vote({
            "Annotator_1": a1,
            "Annotator_2": a2,
            "Annotator_3": a3,
            "GPT_4o_LABEL_PROMPT": gpt,
        })

        prompt_enc = tokenizer(PROMPT_TMPL.format(content=content), add_special_tokens=False)
        answer_enc = tokenizer(target + tokenizer.eos_token, add_special_tokens=False)

        ids = prompt_enc["input_ids"] + answer_enc["input_ids"]
        mask = prompt_enc["attention_mask"] + answer_enc["attention_mask"]
        # Loss is computed only on the answer span.
        labels = [-100] * len(prompt_enc["input_ids"]) + answer_enc["input_ids"]

        # Keep the tail (prompt head is the expendable part).
        ids = ids[-max_length:]
        mask = mask[-max_length:]
        labels = labels[-max_length:]

        batch["input_ids"].append(ids)
        batch["attention_mask"].append(mask)
        batch["labels"].append(labels)

    return batch
|
|
@dataclass
class DataCollator:
    """Pads variable-length tokenized features into right-padded batch tensors."""

    tokenizer: AutoTokenizer

    def __call__(self, features):
        def pad_stack(key, pad_value):
            rows = [torch.tensor(f[key], dtype=torch.long) for f in features]
            return torch.nn.utils.rnn.pad_sequence(
                rows, batch_first=True, padding_value=pad_value
            )

        return {
            "input_ids": pad_stack("input_ids", self.tokenizer.pad_token_id),
            "attention_mask": pad_stack("attention_mask", 0),
            # -100 keeps padded positions out of the loss.
            "labels": pad_stack("labels", -100),
        }
|
|
| |
# --- Tokenizer & model setup ---
# Reuse EOS as the pad token when none is defined; right padding matches the
# right-padded batches built by DataCollator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# NOTE(review): prepare_model_for_kbit_training is called but the model is
# loaded WITHOUT a quantization config, so it is in full precision here —
# confirm whether 4/8-bit loading (BitsAndBytesConfig) was intended.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
base_model = prepare_model_for_kbit_training(base_model)
# LoRA adapters on the attention projections only (q/k/v).
lora_config = LoraConfig(
    r=32, lora_alpha=64, target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.1, bias="none", task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_config)
|
|
| |
# --- Data loading & splitting ---
print(f"Loading {DATASET_NAME}...")
full_dataset = load_dataset(DATASET_NAME, split="train")
# Drop rows missing either side of the conversation.
full_dataset = full_dataset.filter(lambda x: x['prompt'] is not None and x['response'] is not None)


# 80/20 split with a fixed seed for reproducibility.
print("Splitting dataset...")
raw_splits = full_dataset.train_test_split(test_size=0.2, seed=42)
raw_train_dataset = raw_splits["train"]
raw_test_dataset = raw_splits["test"]


print(f"Train samples: {len(raw_train_dataset)}")
print(f"Test samples: {len(raw_test_dataset)}")
|
|
| |
# Tokenize the training split; original columns are removed so only
# input_ids / attention_mask / labels remain for the collator.
train_dataset = raw_train_dataset.map(
    lambda x: preprocess_and_tokenize(x, tokenizer, MAX_LENGTH),
    batched=True,
    remove_columns=raw_train_dataset.column_names
)


# Batch size 1 with dynamic padding; bf16 mixed precision via Accelerate.
collator = DataCollator(tokenizer)
accelerator = Accelerator(mixed_precision="bf16")
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collator)
|
|
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# Linear decay with 0 warmup steps; total steps assume 3 epochs —
# NOTE(review): keep the `* 3` in sync with the training loop's epoch count.
lr_scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_dataloader) * 3)


# Let Accelerate wrap everything for the configured device / precision.
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)
|
|
| |
# --- Training loop: 3 epochs, mean loss logged every 50 steps ---
print("Starting Training...")
model.train()
for epoch in range(3):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        with accelerator.autocast():
            outputs = model(**batch)
            loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()

        # Running average over ~50 steps (the very first window spans 51 steps).
        if step % 50 == 0 and step > 0:
            accelerator.print(f"Epoch {epoch+1} | Step {step} | Loss: {total_loss/50:.4f}")
            total_loss = 0
    accelerator.print(f"Epoch {epoch+1} finished.")
|
|
| |
# Save the LoRA adapter + tokenizer from the main process only (multi-GPU safe).
if accelerator.is_main_process:
    accelerator.unwrap_model(model).save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Model saved.")
|
|
# --- Evaluation: greedy generation on the held-out split, multi-label report ---
if accelerator.is_main_process:
    print("\nEvaluating on Test Set...")
    model.eval()

    y_true, y_pred = [], []

    for i in tqdm(range(len(raw_test_dataset))):
        example = raw_test_dataset[i]

        # Ground truth: same majority-vote + C-code extraction as training.
        row_dict = {k: example.get(k) for k in ["Annotator_1", "Annotator_2", "Annotator_3", "GPT_4o_LABEL_PROMPT"]}
        gt_str = get_majority_vote(row_dict)
        y_true.append(list(extract_codes(gt_str)))

        # Prediction: greedy decode (do_sample=False), then parse C-codes from
        # only the newly generated tokens (prompt is sliced off below).
        content_str = f"User: {example['prompt']}\nAssistant: {example['response']}"
        prompt_text = PROMPT_TMPL.format(content=content_str)
        inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id, do_sample=False)

        gen_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        y_pred.append(list(extract_codes(gen_text)))

    # Binarize over the fixed C0–C3 label set for a per-class report.
    mlb = MultiLabelBinarizer(classes=["C0", "C1", "C2", "C3"])
    y_true_bin = mlb.fit_transform(y_true)
    y_pred_bin = mlb.transform(y_pred)

    print("\n" + classification_report(y_true_bin, y_pred_bin, target_names=mlb.classes_, digits=4, zero_division=0))
|
|
from huggingface_hub import upload_folder
import os  # NOTE(review): appears unused here; kept in case later cells rely on it


# Push the fine-tuned adapter + tokenizer to the Hub; failure is non-fatal.
try:
    upload_folder(
        folder_path=OUTPUT_DIR,
        repo_id="alinia/salguard_v2",
        commit_message="End of training",
        ignore_patterns=["checkpoint-*", "*.pt"],
    )
except Exception as e:
    print(f"❌ Error pushing to Hub: {e}")
else:
    print("✅ Successfully pushed to Hub!")
|
|
|
|