zxc4wewewe
/

blackthinking

@@ -1,47 +1,47 @@
----
-base_model:
-- Novaciano/Eurinoferus-3.2-1B
-- cazzz307/Abliterated-Llama-3.2-1B-Instruct
-library_name: transformers
-tags:
-- mergekit
-- merge
----
-# merge
-This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
-## Merge Details
-### Merge Method
-This model was merged using the [Arcee Fusion](https://arcee.ai) merge method using [Novaciano/Eurinoferus-3.2-1B](https://huggingface.co/Novaciano/Eurinoferus-3.2-1B) as a base.
-### Models Merged
-The following models were included in the merge:
-* [cazzz307/Abliterated-Llama-3.2-1B-Instruct](https://huggingface.co/cazzz307/Abliterated-Llama-3.2-1B-Instruct)
-### Configuration
-The following YAML configuration was used to produce this model:
-```yaml
-dtype: float32
-out_dtype: bfloat16
-merge_method: arcee_fusion
-base_model: Novaciano/Eurinoferus-3.2-1B
-models:
-    - model: Novaciano/Eurinoferus-3.2-1B
-      parameters:
-        weight:
-          - filter: mlp
-            value: [1, 2]
-          - value: 1
-    - model: cazzz307/Abliterated-Llama-3.2-1B-Instruct
-      parameters:
-        weight:
-          - filter: lm_head
-            value: 1
-          - value: [1, 0.5]
-```

+---
+base_model:
+- Novaciano/Eurinoferus-3.2-1B
+- cazzz307/Abliterated-Llama-3.2-1B-Instruct
+library_name: transformers
+tags:
+- mergekit
+- merge
+---
+# merge
+This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
+## Merge Details
+### Merge Method
+This model was merged using the [Arcee Fusion](https://arcee.ai) merge method using [Novaciano/Eurinoferus-3.2-1B](https://huggingface.co/Novaciano/Eurinoferus-3.2-1B) as a base.
+### Models Merged
+The following models were included in the merge:
+* [cazzz307/Abliterated-Llama-3.2-1B-Instruct](https://huggingface.co/cazzz307/Abliterated-Llama-3.2-1B-Instruct)
+### Configuration
+The following YAML configuration was used to produce this model:
+```yaml
+dtype: float32
+out_dtype: bfloat16
+merge_method: arcee_fusion
+base_model: Novaciano/Eurinoferus-3.2-1B
+models:
+    - model: Novaciano/Eurinoferus-3.2-1B
+      parameters:
+        weight:
+          - filter: mlp
+            value: [1, 2]
+          - value: 1
+    - model: cazzz307/Abliterated-Llama-3.2-1B-Instruct
+      parameters:
+        weight:
+          - filter: lm_head
+            value: 1
+          - value: [1, 0.5]
+```

app.py CHANGED Viewed

@@ -1,92 +1,300 @@
 from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    TrainingArguments,
-    Trainer
 )
-from datasets import load_dataset
-import torch
-# 1. Load dataset
-dataset = load_dataset("zxc4wewewe/offsec")
-# 2. Add labels (required for classification)
-# Modify based on your actual classification task:
-def add_labels(example):
-    # Example: Classify if prompt is malicious (1) or benign (0)
-    # Replace this logic with your actual labels!
-    malicious_keywords = ['hack', 'exploit', 'crack', 'bypass', 'inject']
-    text_lower = example["prompt"].lower()
-    example["labels"] = 1 if any(kw in text_lower for kw in malicious_keywords) else 0
-    return example
-dataset = dataset.map(add_labels)
-# 3. Load Tokenizer
-tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-# 4. Tokenize dataset
-def tokenize_function(batch):
-    tokenized = tokenizer(
-        batch["prompt"],
-        padding=True,
-        truncation=True,
-        max_length=512
     )
-    tokenized["labels"] = batch["labels"]
-    return tokenized
-dataset = dataset.map(tokenize_function, batched=True)
-dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-# 5. Load Model with SafeTensors support
-model = AutoModelForSequenceClassification.from_pretrained(
-    "zxc4wewewe/blackthinking",
-    num_labels=2,
-    torch_dtype=torch.float16,  # Optional: saves memory
-    use_safetensors=True        # Force SafeTensors loading
 )
-# 6. Training Arguments with SafeTensors saving
 training_args = TrainingArguments(
-    output_dir="./safetensors_results",
-    num_train_epochs=3,
-    per_device_train_batch_size=4,
-    gradient_accumulation_steps=2,
-    learning_rate=2e-5,
-    logging_steps=10,
-    save_strategy="epoch",
-    # SafeTensors Configuration
-    save_safetensors=True,      # Save as .safetensors (not .bin)
     load_best_model_at_end=True,
-    # Optional optimizations
-    fp16=torch.cuda.is_available(),  # Use FP16 if GPU available
-    report_to="none"
 )
-# 7. Initialize Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=dataset["train"].shuffle(seed=42).select(range(1000)),
-    eval_dataset=dataset["test"].shuffle(seed=42).select(range(200)) if "test" in dataset else None,
     tokenizer=tokenizer,
 )
-# 8. Train and Save
-print("Starting training with SafeTensors format...")
-trainer.train()
-# Save final model in SafeTensors format
-trainer.save_model("./final_safetensors_model")
-print("Model saved in SafeTensors format!")
-# 9. Verification - Check files
-import os
-model_path = "./final_safetensors_model"
-files = os.listdir(model_path)
-print("Saved files:", [f for f in files if f.endswith(('.safetensors', '.json', '.txt'))])

+import os
+import torch
+from datasets import load_dataset, Dataset, DatasetDict
 from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling,
+    EarlyStoppingCallback
 )
+import shutil
+# ─── Configuration ───────────────────────────────────────────────────────────
+MODEL_NAME = "zxc4wewewe/blackthinking"  # Your base model
+OUTPUT_DIR = "./offsec_model"
+MAX_LENGTH = 512
+BATCH_SIZE = 4  # Adjust based on your VRAM
+GRADIENT_ACCUMULATION = 4  # Effective batch = 16
+EPOCHS = 3
+LEARNING_RATE = 2e-5
+SAVE_STEPS = 500
+EVAL_STEPS = 500
+LOGGING_STEPS = 50
+# ─── 1. Load Dataset with Schema Handling ────────────────────────────────────
+def load_and_fix_dataset(dataset_name="zxc4wewewe/offsec", test_size=0.1, seed=42):
+    """Load the training dataset and normalize it to prompt/response pairs.
+
+    Records may arrive as 'prompt'/'response' columns, as a chat-style
+    'messages' list, or as a single 'text'/'content' field; all of them are
+    converted to two string columns, 'prompt' and 'response'.
+
+    Args:
+        dataset_name: Hub id of the dataset to load.
+        test_size: Fraction of 'train' held out as 'test' when the dataset
+            has no 'test' split of its own.
+        seed: Seed used for that hold-out split.
+
+    Returns:
+        The loaded dataset with 'train' and 'test' splits whose only columns
+        are 'prompt' and 'response'.
+
+    Raises:
+        ValueError: If no training example survives the length filter.
+    """
+    try:
+        dataset = load_dataset(dataset_name)
+    except Exception as e:
+        # Only wipe the hub cache after a load has actually failed; deleting it
+        # up front forced a full re-download on every run.
+        print(f"Dataset load failed: {e}")
+        cache_dir = os.path.expanduser(
+            "~/.cache/huggingface/hub/datasets--" + dataset_name.replace("/", "--")
+        )
+        if os.path.exists(cache_dir):
+            shutil.rmtree(cache_dir)
+        print("Cleared cached copy, retrying load...")
+        dataset = load_dataset(dataset_name)
+    # ─── Schema Normalization ────────────────────────────────────────────────
+    def as_text(value):
+        """Return value as a string, mapping None to the empty string."""
+        return "" if value is None else str(value)
+    def normalize_example(example):
+        """Convert any supported record format to prompt/response"""
+        # Already in prompt/response form: just coerce to strings
+        if "prompt" in example and "response" in example:
+            return {
+                "prompt": as_text(example["prompt"]),
+                "response": as_text(example["response"])
+            }
+        # Chat format: keep the last user turn and the last assistant turn
+        if "messages" in example and isinstance(example["messages"], list):
+            prompt = ""
+            response = ""
+            for msg in example["messages"]:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get("role", "")
+                # Coerce here so a None content cannot break the len() filter below
+                content = as_text(msg.get("content", ""))
+                if role in ("user", "human"):
+                    prompt = content
+                elif role in ("assistant", "bot"):
+                    response = content
+            return {"prompt": prompt, "response": response}
+        # Fallback: treat as single text field and split on a known separator
+        text = as_text(example.get("text", example.get("content", "")))
+        for separator in ("Assistant:", "Response:"):
+            if separator in text:
+                head, tail = text.split(separator, 1)
+                return {
+                    "prompt": head.replace("User:", "").strip(),
+                    "response": tail.strip()
+                }
+        return {"prompt": text, "response": ""}
+    # Apply normalization
+    dataset = dataset.map(normalize_example, remove_columns=dataset["train"].column_names)
+    # Filter out empty examples
+    dataset = dataset.filter(lambda x: len(x["prompt"]) > 10 and len(x["response"]) > 5)
+    if len(dataset["train"]) == 0:
+        raise ValueError(f"No usable prompt/response examples found in {dataset_name}")
+    # The rest of the script needs a 'test' split; carve one out if it is missing
+    if "test" not in dataset:
+        held_out = dataset["train"].train_test_split(test_size=test_size, seed=seed)
+        dataset["train"] = held_out["train"]
+        dataset["test"] = held_out["test"]
+    print(f"✓ Dataset loaded: {len(dataset['train'])} train, {len(dataset['test'])} test")
+    print(f"Sample: {dataset['train'][0]}")
+    return dataset
+dataset = load_and_fix_dataset()
+# ─── 2. Tokenizer & Model Setup ─────────────────────────────────────────────
+print(f"\nLoading tokenizer and model: {MODEL_NAME}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+# Fix padding token for causal LM
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+# Load the weights in float32. The Trainer applies mixed precision itself
+# (fp16/bf16 flags in TrainingArguments); starting from float16 weights makes
+# fp16 training abort with "Attempting to unscale FP16 gradients".
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.float32,
+    device_map="auto" if torch.cuda.is_available() else None,
+    trust_remote_code=True
+)
+# Resize embeddings if needed
+model.resize_token_embeddings(len(tokenizer))
+# ─── 3. Tokenization ─────────────────────────────────────────────────────────
+def tokenize_function(examples):
+    """Tokenize a batch of prompt/response pairs for causal LM training.
+
+    Each pair is joined into one string, tokenized with truncation and
+    padding to MAX_LENGTH, and returned with a 'labels' entry copied from
+    'input_ids'.
+
+    Args:
+        examples: Batch with parallel 'prompt' and 'response' sequences.
+
+    Returns:
+        The tokenizer output with an added 'labels' key.
+    """
+    # Format: <prompt>\n\n<response><eos_token>
+    full_texts = [
+        f"{prompt}\n\n{response}{tokenizer.eos_token}"
+        for prompt, response in zip(examples["prompt"], examples["response"])
+    ]
+    # Tokenize
+    result = tokenizer(
+        full_texts,
+        truncation=True,
+        max_length=MAX_LENGTH,
+        padding="max_length",
+        return_tensors=None  # Return lists, not tensors
     )
+    # For causal LM, labels = input_ids (predict next token)
+    # .copy() duplicates only the outer list; the per-example lists are shared.
+    # NOTE(review): the DataCollatorForLanguageModeling passed to the Trainer
+    # presumably rebuilds 'labels' per batch — confirm whether this copy is used.
+    result["labels"] = result["input_ids"].copy()
+    return result
+print("Tokenizing dataset...")
+tokenized_dataset = dataset.map(
+    tokenize_function,
+    batched=True,
+    num_proc=4,  # Parallel processing
+    remove_columns=["prompt", "response"],
+    desc="Tokenizing"
 )
+# ─── 4. Data Collator ────────────────────────────────────────────────────────
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer,
+    mlm=False,  # Causal LM, not masked
+    pad_to_multiple_of=8  # Efficient for GPU
+)
+# ─── 5. Training Arguments ───────────────────────────────────────────────────
 training_args = TrainingArguments(
+    output_dir=OUTPUT_DIR,
+    overwrite_output_dir=True,
+    # Training hyperparameters
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=BATCH_SIZE,
+    per_device_eval_batch_size=BATCH_SIZE,
+    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
+    # Optimizer
+    learning_rate=LEARNING_RATE,
+    weight_decay=0.01,
+    warmup_ratio=0.03,
+    lr_scheduler_type="cosine",
+    # Logging & Saving
+    logging_dir=f"{OUTPUT_DIR}/logs",
+    logging_steps=LOGGING_STEPS,
+    save_strategy="steps",
+    save_steps=SAVE_STEPS,
+    save_total_limit=3,  # Keep only 3 checkpoints
+    # Evaluation
+    evaluation_strategy="steps",
+    eval_steps=EVAL_STEPS,
     load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    # Performance
+    # fp16 and bf16 are mutually exclusive: prefer bf16 where the GPU supports
+    # it, fall back to fp16 on other GPUs, and use neither on CPU.
+    fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
+    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+    dataloader_num_workers=4,
+    remove_unused_columns=False,
+    # Reporting
+    report_to="none",  # Change to "wandb" or "tensorboard" if needed
+    run_name="offsec_training"
 )
+# ─── 6. Initialize Trainer ───────────────────────────────────────────────────
 trainer = Trainer(
     model=model,
     args=training_args,
+    train_dataset=tokenized_dataset["train"],
+    eval_dataset=tokenized_dataset["test"],
+    data_collator=data_collator,
     tokenizer=tokenizer,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement
 )
+# ─── 7. Train ────────────────────────────────────────────────────────────────
+print("\n" + "="*50)
+print("Starting Training...")
+print("="*50)
+# Resume from the newest checkpoint if one exists
+last_checkpoint = None
+if os.path.isdir(OUTPUT_DIR):
+    checkpoints = [
+        name for name in os.listdir(OUTPUT_DIR)
+        if name.startswith("checkpoint-") and name.rsplit("-", 1)[-1].isdigit()
+    ]
+    if checkpoints:
+        # Compare step numbers as integers: a plain string sort would rank
+        # "checkpoint-500" after "checkpoint-1000".
+        newest = max(checkpoints, key=lambda name: int(name.rsplit("-", 1)[-1]))
+        last_checkpoint = os.path.join(OUTPUT_DIR, newest)
+        print(f"Resuming from {last_checkpoint}")
+train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
+# Print metrics
+print("\nTraining completed!")
+print(f"Final loss: {train_result.training_loss:.4f}")
+print(f"Training time: {train_result.metrics['train_runtime']/60:.2f} minutes")
+# ─── 8. Save Final Model ─────────────────────────────────────────────────────
+print(f"\nSaving model to {OUTPUT_DIR}/final_model...")
+# Save adapter/LoRA if using PEFT (uncomment if needed)
+# model.save_pretrained(f"{OUTPUT_DIR}/final_model")
+# Save full model
+trainer.save_model(f"{OUTPUT_DIR}/final_model")
+# Save tokenizer
+tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")
+# Save training config
+trainer.save_state()
+print(f"✓ Model saved to {OUTPUT_DIR}/final_model")
+print(f"✓ Tokenizer saved")
+print(f"✓ Checkpoints saved in {OUTPUT_DIR}")
+# ─── 9. Inference/Testing ────────────────────────────────────────────────────
+def generate_response(prompt, max_new_tokens=256, temperature=0.7):
+    """Generate a completion for a prompt with the trained model.
+
+    Args:
+        prompt: Prompt text; it is followed by a blank line, matching the
+            training format.
+        max_new_tokens: Maximum number of tokens to generate.
+        temperature: Sampling temperature. A value of 0 or None switches to
+            greedy decoding instead of sampling.
+
+    Returns:
+        The decoded newly generated text, stripped of surrounding whitespace.
+    """
+    model.eval()
+    # Format input
+    formatted_prompt = f"{prompt}\n\n"
+    # Leave room for the generated tokens, but never ask for a non-positive length
+    prompt_budget = max(1, MAX_LENGTH - max_new_tokens)
+    inputs = tokenizer(
+        formatted_prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=prompt_budget
+    )
+    # Follow the model's own device instead of assuming the default CUDA device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    generation_kwargs = {
+        "max_new_tokens": max_new_tokens,
+        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+    if temperature is not None and temperature > 0:
+        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
+    else:
+        # Sampling with a zero temperature is rejected by generate()
+        generation_kwargs["do_sample"] = False
+    with torch.no_grad():
+        outputs = model.generate(**inputs, **generation_kwargs)
+    # Decode only the new tokens
+    input_length = inputs["input_ids"].shape[1]
+    new_tokens = outputs[0][input_length:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    return response.strip()
+# Test on a few examples
+print("\n" + "="*50)
+print("Testing Model:")
+print("="*50)
+test_prompts = [
+    "How do I perform a SQL injection attack?",
+    "What is the best way to secure a Linux server?",
+    dataset["test"][0]["prompt"] if len(dataset["test"]) > 0 else "Explain XSS mitigation"
+]
+for i, prompt in enumerate(test_prompts[:3]):
+    print(f"\nTest {i+1}:")
+    print(f"Prompt: {prompt[:100]}...")
+    response = generate_response(prompt)
+    print(f"Response: {response[:200]}...")
+print("\n" + "="*50)
+print("Training pipeline completed successfully!")
+print("="*50)

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
-size 17209920

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920