Text Generation
Transformers
Safetensors
llama
mergekit
Merge
text-generation-inference
zxc4wewewe committed on
Commit
ef22829
·
verified ·
1 Parent(s): 79027b9

Upload 11 files

Browse files
Files changed (3) hide show
  1. README.md +47 -47
  2. app.py +275 -67
  3. tokenizer.json +3 -3
README.md CHANGED
@@ -1,47 +1,47 @@
1
- ---
2
- base_model:
3
- - Novaciano/Eurinoferus-3.2-1B
4
- - cazzz307/Abliterated-Llama-3.2-1B-Instruct
5
- library_name: transformers
6
- tags:
7
- - mergekit
8
- - merge
9
-
10
- ---
11
- # merge
12
-
13
- This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
14
-
15
- ## Merge Details
16
- ### Merge Method
17
-
18
- This model was merged using the [Arcee Fusion](https://arcee.ai) merge method using [Novaciano/Eurinoferus-3.2-1B](https://huggingface.co/Novaciano/Eurinoferus-3.2-1B) as a base.
19
-
20
- ### Models Merged
21
-
22
- The following models were included in the merge:
23
- * [cazzz307/Abliterated-Llama-3.2-1B-Instruct](https://huggingface.co/cazzz307/Abliterated-Llama-3.2-1B-Instruct)
24
-
25
- ### Configuration
26
-
27
- The following YAML configuration was used to produce this model:
28
-
29
- ```yaml
30
- dtype: float32
31
- out_dtype: bfloat16
32
- merge_method: arcee_fusion
33
- base_model: Novaciano/Eurinoferus-3.2-1B
34
- models:
35
- - model: Novaciano/Eurinoferus-3.2-1B
36
- parameters:
37
- weight:
38
- - filter: mlp
39
- value: [1, 2]
40
- - value: 1
41
- - model: cazzz307/Abliterated-Llama-3.2-1B-Instruct
42
- parameters:
43
- weight:
44
- - filter: lm_head
45
- value: 1
46
- - value: [1, 0.5]
47
- ```
 
1
+ ---
2
+ base_model:
3
+ - Novaciano/Eurinoferus-3.2-1B
4
+ - cazzz307/Abliterated-Llama-3.2-1B-Instruct
5
+ library_name: transformers
6
+ tags:
7
+ - mergekit
8
+ - merge
9
+
10
+ ---
11
+ # merge
12
+
13
+ This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
14
+
15
+ ## Merge Details
16
+ ### Merge Method
17
+
18
+ This model was merged using the [Arcee Fusion](https://arcee.ai) merge method using [Novaciano/Eurinoferus-3.2-1B](https://huggingface.co/Novaciano/Eurinoferus-3.2-1B) as a base.
19
+
20
+ ### Models Merged
21
+
22
+ The following models were included in the merge:
23
+ * [cazzz307/Abliterated-Llama-3.2-1B-Instruct](https://huggingface.co/cazzz307/Abliterated-Llama-3.2-1B-Instruct)
24
+
25
+ ### Configuration
26
+
27
+ The following YAML configuration was used to produce this model:
28
+
29
+ ```yaml
30
+ dtype: float32
31
+ out_dtype: bfloat16
32
+ merge_method: arcee_fusion
33
+ base_model: Novaciano/Eurinoferus-3.2-1B
34
+ models:
35
+ - model: Novaciano/Eurinoferus-3.2-1B
36
+ parameters:
37
+ weight:
38
+ - filter: mlp
39
+ value: [1, 2]
40
+ - value: 1
41
+ - model: cazzz307/Abliterated-Llama-3.2-1B-Instruct
42
+ parameters:
43
+ weight:
44
+ - filter: lm_head
45
+ value: 1
46
+ - value: [1, 0.5]
47
+ ```
app.py CHANGED
@@ -1,92 +1,300 @@
 
 
 
1
  from transformers import (
2
- AutoModelForSequenceClassification,
3
- AutoTokenizer,
4
- TrainingArguments,
5
- Trainer
 
 
6
  )
7
- from datasets import load_dataset
8
- import torch
9
 
10
- # 1. Load dataset
11
- dataset = load_dataset("zxc4wewewe/offsec")
12
 
13
- # 2. Add labels (required for classification)
14
- # Modify based on your actual classification task:
15
- def add_labels(example):
16
- # Example: Classify if prompt is malicious (1) or benign (0)
17
- # Replace this logic with your actual labels!
18
- malicious_keywords = ['hack', 'exploit', 'crack', 'bypass', 'inject']
19
- text_lower = example["prompt"].lower()
20
- example["labels"] = 1 if any(kw in text_lower for kw in malicious_keywords) else 0
21
- return example
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- dataset = dataset.map(add_labels)
 
 
24
 
25
- # 3. Load Tokenizer
26
- tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
27
  if tokenizer.pad_token is None:
28
  tokenizer.pad_token = tokenizer.eos_token
 
29
 
30
- # 4. Tokenize dataset
31
- def tokenize_function(batch):
32
- tokenized = tokenizer(
33
- batch["prompt"],
34
- padding=True,
35
- truncation=True,
36
- max_length=512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  )
38
- tokenized["labels"] = batch["labels"]
39
- return tokenized
40
-
41
- dataset = dataset.map(tokenize_function, batched=True)
42
- dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
43
-
44
- # 5. Load Model with SafeTensors support
45
- model = AutoModelForSequenceClassification.from_pretrained(
46
- "zxc4wewewe/blackthinking",
47
- num_labels=2,
48
- torch_dtype=torch.float16, # Optional: saves memory
49
- use_safetensors=True # Force SafeTensors loading
50
  )
51
 
52
- # 6. Training Arguments with SafeTensors saving
 
 
 
 
 
 
 
53
  training_args = TrainingArguments(
54
- output_dir="./safetensors_results",
55
- num_train_epochs=3,
56
- per_device_train_batch_size=4,
57
- gradient_accumulation_steps=2,
58
- learning_rate=2e-5,
59
- logging_steps=10,
60
- save_strategy="epoch",
61
-
62
- # SafeTensors Configuration
63
- save_safetensors=True, # Save as .safetensors (not .bin)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  load_best_model_at_end=True,
 
 
 
 
 
 
 
65
 
66
- # Optional optimizations
67
- fp16=torch.cuda.is_available(), # Use FP16 if GPU available
68
- report_to="none"
69
  )
70
 
71
- # 7. Initialize Trainer
72
  trainer = Trainer(
73
  model=model,
74
  args=training_args,
75
- train_dataset=dataset["train"].shuffle(seed=42).select(range(1000)),
76
- eval_dataset=dataset["test"].shuffle(seed=42).select(range(200)) if "test" in dataset else None,
 
77
  tokenizer=tokenizer,
 
78
  )
79
 
80
- # 8. Train and Save
81
- print("Starting training with SafeTensors format...")
82
- trainer.train()
 
83
 
84
- # Save final model in SafeTensors format
85
- trainer.save_model("./final_safetensors_model")
86
- print("Model saved in SafeTensors format!")
 
 
 
 
87
 
88
- # 9. Verification - Check files
89
- import os
90
- model_path = "./final_safetensors_model"
91
- files = os.listdir(model_path)
92
- print("Saved files:", [f for f in files if f.endswith(('.safetensors', '.json', '.txt'))])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from datasets import load_dataset, Dataset, DatasetDict
4
  from transformers import (
5
+ AutoTokenizer,
6
+ AutoModelForCausalLM,
7
+ TrainingArguments,
8
+ Trainer,
9
+ DataCollatorForLanguageModeling,
10
+ EarlyStoppingCallback
11
  )
12
+ import shutil
 
13
 
 
 
14
 
15
+
16
+ # ─── Configuration ───────────────────────────────────────────────────────────
17
+ MODEL_NAME = "zxc4wewewe/blackthinking" # Your base model
18
+ OUTPUT_DIR = "./offsec_model"
19
+ MAX_LENGTH = 512
20
+ BATCH_SIZE = 4 # Adjust based on your VRAM
21
+ GRADIENT_ACCUMULATION = 4 # Effective batch = 16
22
+ EPOCHS = 3
23
+ LEARNING_RATE = 2e-5
24
+ SAVE_STEPS = 500
25
+ EVAL_STEPS = 500
26
+ LOGGING_STEPS = 50
27
+
28
+ # ─── 1. Load Dataset with Schema Handling ────────────────────────────────────
29
+ def load_and_fix_dataset():
30
+ """Load dataset handling both 'messages' and 'prompt/response' formats"""
31
+ cache_dir = os.path.expanduser("~/.cache/huggingface/hub/datasets--zxc4wewewe--offsec")
32
+
33
+ # Clear corrupted cache
34
+ if os.path.exists(cache_dir):
35
+ shutil.rmtree(cache_dir)
36
+
37
+ try:
38
+ # Try loading specific files first (avoid training-data-sample.parquet)
39
+ dataset = load_dataset("arcee-ai/LLama-405B-Logits")
40
+ except Exception as e:
41
+ print(f"Specific file load failed: {e}")
42
+ print("Trying generic load...")
43
+ dataset = load_dataset("zxc4wewewe/offsec")
44
+
45
+ # ─── Schema Normalization ────────────────────────────────────────────────
46
+ def normalize_example(example):
47
+ """Convert any format to prompt/response"""
48
+ # If already has prompt/response, return as-is
49
+ if "prompt" in example and "response" in example:
50
+ return {
51
+ "prompt": str(example["prompt"]) if example["prompt"] is not None else "",
52
+ "response": str(example["response"]) if example["response"] is not None else ""
53
+ }
54
+
55
+ # If has messages (chat format), convert
56
+ if "messages" in example and isinstance(example["messages"], list):
57
+ messages = example["messages"]
58
+ prompt = ""
59
+ response = ""
60
+
61
+ for msg in messages:
62
+ if isinstance(msg, dict):
63
+ role = msg.get("role", "")
64
+ content = msg.get("content", "")
65
+ if role == "user" or role == "human":
66
+ prompt = content
67
+ elif role == "assistant" or role == "bot":
68
+ response = content
69
+
70
+ return {"prompt": prompt, "response": response}
71
+
72
+ # Fallback: treat as single text field
73
+ text = str(example.get("text", example.get("content", "")))
74
+ # Try to split on common separators
75
+ if "Assistant:" in text or "Response:" in text:
76
+ parts = text.split("Assistant:", 1) if "Assistant:" in text else text.split("Response:", 1)
77
+ return {
78
+ "prompt": parts[0].replace("User:", "").strip(),
79
+ "response": parts[1].strip()
80
+ }
81
+
82
+ return {"prompt": text, "response": ""}
83
+
84
+ # Apply normalization
85
+ dataset = dataset.map(normalize_example, remove_columns=dataset["train"].column_names)
86
+
87
+ # Filter out empty examples
88
+ dataset = dataset.filter(lambda x: len(x["prompt"]) > 10 and len(x["response"]) > 5)
89
+
90
+ print(f"✓ Dataset loaded: {len(dataset['train'])} train, {len(dataset['test'])} test")
91
+ print(f"Sample: {dataset['train'][0]}")
92
+
93
+ return dataset
94
+
95
+ dataset = load_and_fix_dataset()
96
 
97
+ # ─── 2. Tokenizer & Model Setup ─────────────────────────────────────────────
98
+ print(f"\nLoading tokenizer and model: {MODEL_NAME}")
99
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
100
 
101
+ # Fix padding token for causal LM
 
102
  if tokenizer.pad_token is None:
103
  tokenizer.pad_token = tokenizer.eos_token
104
+ tokenizer.pad_token_id = tokenizer.eos_token_id
105
 
106
+ model = AutoModelForCausalLM.from_pretrained(
107
+ MODEL_NAME,
108
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
109
+ device_map="auto" if torch.cuda.is_available() else None,
110
+ trust_remote_code=True
111
+ )
112
+
113
+ # Resize embeddings if needed
114
+ model.resize_token_embeddings(len(tokenizer))
115
+
116
+ # ─── 3. Tokenization ─────────────────────────────────────────────────────────
117
+ def tokenize_function(examples):
118
+ """Combine prompt and response for causal LM training"""
119
+ # Format: Prompt\n\nResponse\n<|endoftext|>
120
+ full_texts = [
121
+ f"{prompt}\n\n{response}{tokenizer.eos_token}"
122
+ for prompt, response in zip(examples["prompt"], examples["response"])
123
+ ]
124
+
125
+ # Tokenize
126
+ result = tokenizer(
127
+ full_texts,
128
+ truncation=True,
129
+ max_length=MAX_LENGTH,
130
+ padding="max_length",
131
+ return_tensors=None # Return lists, not tensors
132
  )
133
+
134
+ # For causal LM, labels = input_ids (predict next token)
135
+ result["labels"] = result["input_ids"].copy()
136
+ return result
137
+
138
+ print("Tokenizing dataset...")
139
+ tokenized_dataset = dataset.map(
140
+ tokenize_function,
141
+ batched=True,
142
+ num_proc=4, # Parallel processing
143
+ remove_columns=["prompt", "response"],
144
+ desc="Tokenizing"
145
  )
146
 
147
+ # ─── 4. Data Collator ────────────────────────────────────────────────────────
148
+ data_collator = DataCollatorForLanguageModeling(
149
+ tokenizer=tokenizer,
150
+ mlm=False, # Causal LM, not masked
151
+ pad_to_multiple_of=8 # Efficient for GPU
152
+ )
153
+
154
+ # ─── 5. Training Arguments ───────────────────────────────────────────────────
155
  training_args = TrainingArguments(
156
+ output_dir=OUTPUT_DIR,
157
+ overwrite_output_dir=True,
158
+
159
+ # Training hyperparameters
160
+ num_train_epochs=EPOCHS,
161
+ per_device_train_batch_size=BATCH_SIZE,
162
+ per_device_eval_batch_size=BATCH_SIZE,
163
+ gradient_accumulation_steps=GRADIENT_ACCUMULATION,
164
+
165
+ # Optimizer
166
+ learning_rate=LEARNING_RATE,
167
+ weight_decay=0.01,
168
+ warmup_ratio=0.03,
169
+ lr_scheduler_type="cosine",
170
+
171
+ # Logging & Saving
172
+ logging_dir=f"{OUTPUT_DIR}/logs",
173
+ logging_steps=LOGGING_STEPS,
174
+ save_strategy="steps",
175
+ save_steps=SAVE_STEPS,
176
+ save_total_limit=3, # Keep only 3 checkpoints
177
+
178
+ # Evaluation
179
+ evaluation_strategy="steps",
180
+ eval_steps=EVAL_STEPS,
181
  load_best_model_at_end=True,
182
+ metric_for_best_model="eval_loss",
183
+
184
+ # Performance
185
+ fp16=torch.cuda.is_available(), # Use mixed precision if GPU
186
+ bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
187
+ dataloader_num_workers=4,
188
+ remove_unused_columns=False,
189
 
190
+ # Reporting
191
+ report_to="none", # Change to "wandb" or "tensorboard" if needed
192
+ run_name="offsec_training"
193
  )
194
 
195
+ # ─── 6. Initialize Trainer ───────────────────────────────────────────────────
196
  trainer = Trainer(
197
  model=model,
198
  args=training_args,
199
+ train_dataset=tokenized_dataset["train"],
200
+ eval_dataset=tokenized_dataset["test"],
201
+ data_collator=data_collator,
202
  tokenizer=tokenizer,
203
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Stop if no improvement
204
  )
205
 
206
+ # ─── 7. Train ────────────────────────────────────────────────────────────────
207
+ print("\n" + "="*50)
208
+ print("Starting Training...")
209
+ print("="*50)
210
 
211
+ # Resume from checkpoint if exists
212
+ last_checkpoint = None
213
+ if os.path.isdir(OUTPUT_DIR) and len(os.listdir(OUTPUT_DIR)) > 0:
214
+ checkpoints = [f for f in os.listdir(OUTPUT_DIR) if f.startswith("checkpoint-")]
215
+ if checkpoints:
216
+ last_checkpoint = os.path.join(OUTPUT_DIR, sorted(checkpoints)[-1])
217
+ print(f"Resuming from {last_checkpoint}")
218
 
219
+ train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
220
+
221
+ # Print metrics
222
+ print("\nTraining completed!")
223
+ print(f"Final loss: {train_result.training_loss:.4f}")
224
+ print(f"Training time: {train_result.metrics['train_runtime']/60:.2f} minutes")
225
+
226
+ # ─── 8. Save Final Model ─────────────────────────────────────────────────────
227
+ print(f"\nSaving model to {OUTPUT_DIR}/final_model...")
228
+
229
+ # Save adapter/LoRA if using PEFT (uncomment if needed)
230
+ # model.save_pretrained(f"{OUTPUT_DIR}/final_model")
231
+
232
+ # Save full model
233
+ trainer.save_model(f"{OUTPUT_DIR}/final_model")
234
+
235
+ # Save tokenizer
236
+ tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")
237
+
238
+ # Save training config
239
+ trainer.save_state()
240
+
241
+ print(f"✓ Model saved to {OUTPUT_DIR}/final_model")
242
+ print(f"✓ Tokenizer saved")
243
+ print(f"✓ Checkpoints saved in {OUTPUT_DIR}")
244
+
245
+ # ─── 9. Inference/Testing ────────────────────────────────────────────────────
246
+ def generate_response(prompt, max_new_tokens=256, temperature=0.7):
247
+ """Test the trained model"""
248
+ model.eval()
249
+
250
+ # Format input
251
+ formatted_prompt = f"{prompt}\n\n"
252
+
253
+ inputs = tokenizer(
254
+ formatted_prompt,
255
+ return_tensors="pt",
256
+ truncation=True,
257
+ max_length=MAX_LENGTH - max_new_tokens
258
+ )
259
+
260
+ if torch.cuda.is_available():
261
+ inputs = {k: v.cuda() for k, v in inputs.items()}
262
+
263
+ with torch.no_grad():
264
+ outputs = model.generate(
265
+ **inputs,
266
+ max_new_tokens=max_new_tokens,
267
+ temperature=temperature,
268
+ top_p=0.9,
269
+ do_sample=True,
270
+ pad_token_id=tokenizer.eos_token_id,
271
+ eos_token_id=tokenizer.eos_token_id,
272
+ )
273
+
274
+ # Decode only the new tokens
275
+ input_length = inputs["input_ids"].shape[1]
276
+ new_tokens = outputs[0][input_length:]
277
+ response = tokenizer.decode(new_tokens, skip_special_tokens=True)
278
+
279
+ return response.strip()
280
+
281
+ # Test on a few examples
282
+ print("\n" + "="*50)
283
+ print("Testing Model:")
284
+ print("="*50)
285
+
286
+ test_prompts = [
287
+ "How do I perform a SQL injection attack?",
288
+ "What is the best way to secure a Linux server?",
289
+ dataset["test"][0]["prompt"] if len(dataset["test"]) > 0 else "Explain XSS mitigation"
290
+ ]
291
+
292
+ for i, prompt in enumerate(test_prompts[:3]):
293
+ print(f"\nTest {i+1}:")
294
+ print(f"Prompt: {prompt[:100]}...")
295
+ response = generate_response(prompt)
296
+ print(f"Response: {response[:200]}...")
297
+
298
+ print("\n" + "="*50)
299
+ print("Training pipeline completed successfully!")
300
+ print("="*50)
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
- size 17209920
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920