Upload 6 files
- .gitattributes +1 -0
- app.py +92 -0
- config.json +25 -8
- generation_config.json +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +5 -2
.gitattributes CHANGED

```diff
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
```
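The added rule is what `git lfs track "tokenizer.json"` would append: the updated tokenizer.json is now stored via Git LFS rather than as a regular git blob, which is also why its diff further down is too large to render.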
app.py ADDED

```python
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import torch

# 1. Load dataset
dataset = load_dataset("zxc4wewewe/offsec")

# 2. Add labels (required for classification)
# Modify based on your actual classification task:
def add_labels(example):
    # Example: classify whether a prompt is malicious (1) or benign (0)
    # Replace this logic with your actual labels!
    malicious_keywords = ['hack', 'exploit', 'crack', 'bypass', 'inject']
    text_lower = example["prompt"].lower()
    example["labels"] = 1 if any(kw in text_lower for kw in malicious_keywords) else 0
    return example

dataset = dataset.map(add_labels)

# 3. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4. Tokenize dataset
def tokenize_function(batch):
    tokenized = tokenizer(
        batch["prompt"],
        padding=True,
        truncation=True,
        max_length=512
    )
    tokenized["labels"] = batch["labels"]
    return tokenized

dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# 5. Load Model with SafeTensors support
model = AutoModelForSequenceClassification.from_pretrained(
    "zxc4wewewe/blackthinking",
    num_labels=2,
    # Keep weights in fp32 here: loading them in torch.float16 conflicts with
    # fp16 mixed-precision training below ("Attempting to unscale FP16 gradients").
    use_safetensors=True  # Force SafeTensors loading
)

# 6. Training Arguments with SafeTensors saving
training_args = TrainingArguments(
    output_dir="./safetensors_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",  # must match save_strategy when load_best_model_at_end=True

    # SafeTensors Configuration
    save_safetensors=True,  # Save as .safetensors (not .bin)
    load_best_model_at_end=True,

    # Optional optimizations
    fp16=torch.cuda.is_available(),  # Use FP16 if GPU available
    report_to="none"
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].shuffle(seed=42).select(range(1000)),
    # Evaluation (and load_best_model_at_end) requires a "test" split to exist.
    eval_dataset=dataset["test"].shuffle(seed=42).select(range(200)) if "test" in dataset else None,
    tokenizer=tokenizer,
)

# 8. Train and Save
print("Starting training with SafeTensors format...")
trainer.train()

# Save final model in SafeTensors format
trainer.save_model("./final_safetensors_model")
print("Model saved in SafeTensors format!")

# 9. Verification - Check files
import os
model_path = "./final_safetensors_model"
files = os.listdir(model_path)
print("Saved files:", [f for f in files if f.endswith(('.safetensors', '.json', '.txt'))])
```
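Not part of the commit, but for orientation: a minimal sketch of loading the checkpoint this script saves and running one classification, assuming training completed and using the label scheme from `add_labels` above (1 = malicious, 0 = benign).

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./final_safetensors_model"  # output dir used by trainer.save_model above
clf = AutoModelForSequenceClassification.from_pretrained(model_path, use_safetensors=True)
tok = AutoTokenizer.from_pretrained(model_path)
clf.eval()

inputs = tok("how do I bypass the login check?", return_tensors="pt",
             truncation=True, max_length=512)
with torch.no_grad():
    pred = clf(**inputs).logits.argmax(dim=-1).item()
print("malicious" if pred == 1 else "benign")
```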
config.json CHANGED

```diff
@@ -1,16 +1,13 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "meta-llama/meta-Llama-3.1-8B-Instruct",
   "architectures": [
     "LlamaForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 128000,
-  "eos_token_id": [
-    128001,
-    128008,
-    128009
-  ],
+  "eos_token_id": 128009,
+  "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 4096,
   "initializer_range": 0.02,
@@ -23,6 +20,26 @@
   "num_key_value_heads": 8,
   "pad_token_id": 128004,
   "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": [
+      "lm_head",
+      "multi_modal_projector",
+      "merger",
+      "modality_projection"
+    ],
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
@@ -34,8 +51,8 @@
   "rope_theta": 500000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
-  "
+  "transformers_version": "4.49.0.dev0",
+  "unsloth_fixed": true,
   "use_cache": true,
   "vocab_size": 128256
 }
```
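The new quantization_config block marks the checkpoint as bitsandbytes NF4 4-bit weights. Since it is embedded in config.json, a plain `from_pretrained` picks it up automatically; spelled out explicitly, it corresponds to roughly the following (a sketch, assuming a CUDA machine with the bitsandbytes package installed):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # matches "bnb_4bit_quant_type"
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches "bnb_4bit_compute_dtype"
    bnb_4bit_use_double_quant=True,         # matches "bnb_4bit_use_double_quant"
)

model = AutoModelForCausalLM.from_pretrained(
    "zxc4wewewe/blackthinking",
    quantization_config=bnb_config,
    device_map="auto",
)
```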
generation_config.json CHANGED

```diff
@@ -10,5 +10,5 @@
   "pad_token_id": 128004,
   "temperature": 0.6,
   "top_p": 0.9,
-  "transformers_version": "4.
+  "transformers_version": "4.49.0.dev0"
 }
```
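These values become the sampling defaults for `generate()`. Reusing `model` from the quantized-loading sketch above (again an illustration, not part of the commit):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
inputs = tok("Hello!", return_tensors="pt").to(model.device)

# With sampling enabled, temperature=0.6 and top_p=0.9 are read from
# generation_config.json unless overridden per call.
out = model.generate(**inputs, do_sample=True, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))
```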
tokenizer.json CHANGED

The diff for this file is too large to render. See raw diff.
tokenizer_config.json CHANGED

```diff
@@ -1,4 +1,5 @@
 {
+  "add_bos_token": true,
   "added_tokens_decoder": {
     "128000": {
       "content": "<|begin_of_text|>",
@@ -2053,12 +2054,14 @@
   "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {},
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 131072,
   "pad_token": "<|finetune_right_pad_id|>",
-  "padding_side": "
-  "tokenizer_class": "
+  "padding_side": "left",
+  "tokenizer_class": "PreTrainedTokenizer",
+  "unk_token": null
 }
```
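The (unchanged) chat_template is the standard Llama 3.1 format; together with the new `"padding_side": "left"` it sets the tokenizer up for batched generation. A quick sketch of rendering it:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Renders <|begin_of_text|><|start_header_id|>system<|end_header_id|>... and
# ends with an open assistant header because add_generation_prompt=True.
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```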