File size: 2,714 Bytes
67c98a0
 
 
 
 
 
 
 
 
 
 
473ce62
67c98a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import re

import datasets
import gradio as gr
import torch
import tqdm
import transformers
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Hugging Face Authentication.
# The token is read from the environment rather than hard-coded; if
# HF_TOKEN is unset this raises KeyError immediately, failing fast
# before any model/dataset downloads or training work begins.
HF_TOKEN = os.environ["HF_TOKEN"]
login(HF_TOKEN)

# Model and dataset configuration.
# MODEL_NAME is used both as the base checkpoint to fine-tune and as the
# Hub repository the trained model is pushed back to (see training_args).
MODEL_NAME = "universeofml/DeepFocus-LLM-Privacy"
# NOTE(review): "cnn_dailymail" and "common_voice" normally require a
# config name (e.g. "3.0.0", a language code) when passed to
# load_dataset — confirm the intended configurations.
DATASETS = [
    "wikitext", "cnn_dailymail", "squad", "bookcorpus", "openwebtext", "common_voice"
]

# Load the tokenizer once at module level; it is shared by the dataset
# preprocessing functions and the Trainer below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Function to load and preprocess every dataset listed in DATASETS.
def load_and_preprocess_dataset():
    """Load, tokenize, and concatenate all corpora in ``DATASETS``.

    Each dataset's train split is tokenized to fixed-length (512)
    ``input_ids`` and the per-dataset columns are dropped so the
    resulting schemas match; otherwise ``concatenate_datasets`` would
    fail on heterogeneous corpora (e.g. wikitext vs. squad).

    Returns:
        datasets.Dataset: one concatenated dataset whose rows contain
        only ``input_ids``.
    """

    def _tokenize_batch(batch, text_column):
        # Batched tokenization: one tokenizer call per batch instead of
        # one per example — same output, much faster.
        return {
            "input_ids": tokenizer(
                batch[text_column],
                truncation=True,
                padding="max_length",
                max_length=512,
            )["input_ids"]
        }

    dataset_list = []
    for dataset_name in DATASETS:
        # NOTE(review): some entries (cnn_dailymail, common_voice)
        # require an explicit config name; load_dataset raises without
        # one — confirm the intended configs for those corpora.
        dataset = load_dataset(dataset_name, split="train")
        # Not every corpus names its text column "text" (cnn_dailymail
        # uses "article", squad has "context"); fall back to the first
        # string-valued feature so tokenization does not KeyError.
        if "text" in dataset.column_names:
            text_column = "text"
        else:
            text_column = next(
                name
                for name, feature in dataset.features.items()
                if getattr(feature, "dtype", None) == "string"
            )
        dataset = dataset.map(
            _tokenize_batch,
            batched=True,
            fn_kwargs={"text_column": text_column},
            # Drop source columns: concatenation requires identical
            # schemas across all datasets.
            remove_columns=dataset.column_names,
        )
        dataset_list.append(dataset)
    return datasets.concatenate_datasets(dataset_list)

# Function to redact privacy-sensitive phrases from text.
def filter_sensitive_words(text):
    """Replace sensitive phrases in *text* with ``[REDACTED]``.

    Matching is case-insensitive, so variants such as "Password" or
    "CREDIT CARD" are caught too (the original ``str.replace`` only
    matched exact lowercase phrases and let capitalized ones through).

    Args:
        text: arbitrary input string (may be empty).

    Returns:
        str: the input with every sensitive phrase redacted.
    """
    SENSITIVE_WORDS = ["password", "social security", "credit card", "classified"]
    for word in SENSITIVE_WORDS:
        # re.escape keeps each phrase literal; IGNORECASE generalizes
        # the old case-sensitive replacement without changing what it
        # already matched.
        text = re.sub(re.escape(word), "[REDACTED]", text, flags=re.IGNORECASE)
    return text

# Load and tokenize the full training corpus (slow: downloads and maps
# every dataset in DATASETS).
dataset = load_and_preprocess_dataset()

# Apply sensitive-word filtering.
# NOTE(review): each already-tokenized example is decoded back to text,
# redacted, then re-tokenized. The decoded text includes padding/special
# tokens, so the result can differ from redacting the raw text before
# the first tokenization — consider moving filter_sensitive_words()
# upstream into load_and_preprocess_dataset() instead.
dataset = dataset.map(lambda x: {"input_ids": tokenizer(filter_sensitive_words(tokenizer.decode(x["input_ids"])), truncation=True, padding="max_length", max_length=512)["input_ids"]})

# Load the base causal-LM checkpoint to fine-tune.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Training arguments.
# Effective train batch size = 4 (per device) * 4 (accumulation) = 16.
training_args = TrainingArguments(
    output_dir="./deepfocus-llm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # NOTE(review): renamed to `eval_strategy` in recent transformers
    # releases — confirm against the pinned transformers version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=True,  # NOTE(review): requires a CUDA device; fails on CPU-only
    report_to="none",  # disable W&B/TensorBoard integrations
    push_to_hub=True,
    hub_model_id="universeofml/DeepFocus-LLM-Privacy",
    hub_token=HF_TOKEN,  # presumably redundant with login() above — verify
)

# Trainer wiring the model, arguments, and data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # NOTE(review): the eval set is the first 1000 rows of the TRAINING
    # set, so eval loss measures memorization, not generalization —
    # consider a held-out split (e.g. dataset.train_test_split).
    eval_dataset=dataset.select(range(1000)),  # Small evaluation subset
    # NOTE(review): `tokenizer=` is deprecated in favor of
    # `processing_class=` in recent transformers — verify version.
    tokenizer=tokenizer,
)

# Gradio UI callback.
def train_model():
    """Run the full fine-tuning job, then upload the model to the Hub.

    Blocks until training completes (the Gradio button stays busy for
    the whole run), then returns a status string shown in the UI.
    """
    trainer.train()
    trainer.push_to_hub()
    status = "Training Completed and Model Uploaded to Hugging Face!"
    return status

# Gradio interface: a single button that kicks off the (blocking)
# training run and a textbox that displays the returned status string.
with gr.Blocks() as demo:
    gr.Markdown("# Train DeepFocus-LLM-Privacy")
    btn = gr.Button("Start Training")
    output = gr.Textbox(label="Training Status")
    btn.click(train_model, outputs=output)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()