"""Fine-tune a causal LM on a mix of public text corpora, with a simple
privacy-redaction pass, exposed through a one-button Gradio UI."""

import os
import re

import gradio as gr
import torch
import transformers
import datasets
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from huggingface_hub import login

# Hugging Face authentication — fail fast with a clear message instead of a
# bare KeyError when the token is absent.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Environment variable HF_TOKEN is not set; export your Hugging Face token first.")
login(HF_TOKEN)

# Model and dataset
MODEL_NAME = "universeofml/DeepFocus-LLM-Privacy"
DATASETS = [
    "wikitext",
    "cnn_dailymail",
    "squad",
    "bookcorpus",
    "openwebtext",
    "common_voice",
]

# Several of these datasets refuse to load without an explicit config name;
# None means no config is required.
DATASET_CONFIGS = {
    "wikitext": "wikitext-103-raw-v1",
    "cnn_dailymail": "3.0.0",
    "squad": None,
    "bookcorpus": None,
    "openwebtext": None,
    "common_voice": "en",
}

# Not every dataset stores its text under a "text" column; default is "text".
TEXT_COLUMNS = {
    "cnn_dailymail": "article",
    "squad": "context",
    "common_voice": "sentence",
}

MAX_LENGTH = 512

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Causal-LM tokenizers (GPT-style) often ship without a pad token, which makes
# padding="max_length" raise; fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def _tokenize_text(text):
    """Tokenize *text* to fixed length and attach causal-LM labels.

    The Trainer needs a "labels" column to compute a loss; for next-token
    prediction the labels are a copy of the input ids.
    """
    ids = tokenizer(text, truncation=True, padding="max_length", max_length=MAX_LENGTH)["input_ids"]
    return {"input_ids": ids, "labels": list(ids)}


def load_and_preprocess_dataset():
    """Load every corpus in DATASETS, tokenize its text column, and concatenate.

    Returns a single `datasets.Dataset` whose rows carry only "input_ids" and
    "labels" so the per-dataset schemas match for concatenation.
    """
    dataset_list = []
    for dataset_name in DATASETS:
        config = DATASET_CONFIGS.get(dataset_name)
        dataset = load_dataset(dataset_name, config, split="train")
        text_column = TEXT_COLUMNS.get(dataset_name, "text")
        dataset = dataset.map(
            # bind text_column as a default to avoid late-binding closure bugs
            lambda x, col=text_column: _tokenize_text(x[col]),
            remove_columns=dataset.column_names,
        )
        dataset_list.append(dataset)
    return datasets.concatenate_datasets(dataset_list)


# Compiled once; case-insensitive so "Password" / "CLASSIFIED" are also caught.
_SENSITIVE_PATTERN = re.compile(
    "|".join(re.escape(w) for w in ("password", "social security", "credit card", "classified")),
    re.IGNORECASE,
)


def filter_sensitive_words(text):
    """Replace known sensitive phrases in *text* with "[REDACTED]"."""
    return _SENSITIVE_PATTERN.sub("[REDACTED]", text)


# Load dataset
dataset = load_and_preprocess_dataset()

# Apply redaction: decode (skipping pad/special tokens so they are not baked
# into the re-encoded text), filter, then re-tokenize with fresh labels.
dataset = dataset.map(
    lambda x: _tokenize_text(
        filter_sensitive_words(tokenizer.decode(x["input_ids"], skip_special_tokens=True))
    )
)

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Training arguments. fp16 is only valid on CUDA hardware — enabling it on a
# CPU-only host makes TrainingArguments raise.
training_args = TrainingArguments(
    output_dir="./deepfocus-llm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none",
    push_to_hub=True,
    hub_model_id="universeofml/DeepFocus-LLM-Privacy",
    hub_token=HF_TOKEN,
)

# Small evaluation subset; guard against corpora shorter than 1000 rows.
eval_subset = dataset.select(range(min(1000, len(dataset))))

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
)


def train_model():
    """Run the full training loop, push the result to the Hub, report status."""
    trainer.train()
    trainer.push_to_hub()
    return "Training Completed and Model Uploaded to Hugging Face!"


# Gradio interface: a single button that kicks off training synchronously.
with gr.Blocks() as demo:
    gr.Markdown("# Train DeepFocus-LLM-Privacy")
    btn = gr.Button("Start Training")
    output = gr.Textbox(label="Training Status")
    btn.click(train_model, outputs=output)

# Run Gradio app
if __name__ == "__main__":
    demo.launch()