File size: 2,034 Bytes
ee63cc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from datasets import load_dataset
import pandas as pd
import os

def train_on_devign(base_model="microsoft/codebert-base", output_dir="./trained_model"):
    """Fine-tune a sequence-classification model on the Devign dataset.

    Loads the ``DetectVul/devign`` dataset from the Hugging Face Hub,
    tokenizes its ``func`` column, fine-tunes ``base_model`` with a 2-label
    classification head for 3 epochs (evaluating on the ``test`` split after
    every epoch), and finally writes the model and tokenizer to
    ``output_dir``.

    Args:
        base_model: Model identifier handed to ``from_pretrained`` for both
            the tokenizer and the classification model.
        output_dir: Directory used for Trainer checkpoints and for the final
            saved model and tokenizer.

    Returns:
        None. If the dataset cannot be loaded, the error is printed and the
        function returns early without training or saving anything.
    """
    # Local import: only needed for the TrainingArguments version check below.
    import inspect

    print(f"🚀 Initializing Autotrain Engine for {base_model}")

    # 1. Load specialized Devign dataset
    print("📥 Loading Devign dataset from Hugging Face Hub...")
    try:
        dataset = load_dataset("DetectVul/devign")
    except Exception as e:
        # Best-effort boundary: there is no fallback dataset, so report the
        # failure accurately and stop instead of claiming a fallback.
        print(f"Failed to load Devign: {e}. Aborting training.")
        return

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    def tokenize_function(examples):
        # Every example is padded/truncated to exactly 512 tokens so batches
        # can be stacked without a padding collator.
        return tokenizer(examples["func"], padding="max_length", truncation=True, max_length=512)

    print("✂️ Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # NOTE(review): no label column is selected or renamed here; this assumes
    # the dataset already exposes one under a name the Trainer recognises
    # (e.g. "label"/"labels") — TODO confirm against the dataset schema.

    # 2. Load Model
    print("🧠 Loading Base Model...")
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

    # 3. Setup Training
    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; choose whichever keyword the installed version
    # accepts so the script works on both sides of the rename.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted_args:
        strategy_key = "eval_strategy"
    else:
        strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # small batch; 512-token inputs are memory-heavy
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        logging_dir='./logs',
        **{strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
    )

    # 4. Train
    print("🔥 Starting Fine-tuning cycle...")
    trainer.train()

    # 5. Save & Update
    print(f"✅ Training Complete. Saving to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

if __name__ == "__main__":
    # Script entry point: runs one fine-tuning pass using the function's
    # default base model and output directory.
    # In a real scenario, this would be triggered by /train
    train_on_devign()