# Hugging Face Space training app (Space status at capture time: "Runtime error").
import os
import re

import datasets
import gradio as gr
import torch
import tqdm
import transformers
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# --- Hugging Face authentication and run configuration ---

# Fail fast with an actionable message if the token secret is missing,
# instead of the bare KeyError that os.environ["HF_TOKEN"] raised.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN environment variable is not set; "
        "add a Hugging Face access token to the environment before running."
    )
login(HF_TOKEN)

# Base model to fine-tune; also the hub repo the trained model is pushed to.
MODEL_NAME = "universeofml/DeepFocus-LLM-Privacy"

# Corpora concatenated into a single training set (see load_and_preprocess_dataset).
DATASETS = [
    "wikitext", "cnn_dailymail", "squad", "bookcorpus", "openwebtext", "common_voice"
]

# Tokenizer shared by preprocessing and training.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Per-dataset loading spec: (config name or None, text column).
# The original called load_dataset(name) with no config and read x["text"]
# unconditionally, which raises for most of these corpora: wikitext,
# cnn_dailymail and common_voice require an explicit config, and
# cnn_dailymail / squad / common_voice do not expose a "text" column.
# NOTE(review): common_voice is gated and the "en" config is an assumption —
# confirm the intended language/split before training.
_DATASET_SPECS = {
    "wikitext": ("wikitext-103-raw-v1", "text"),
    "cnn_dailymail": ("3.0.0", "article"),
    "squad": (None, "context"),
    "bookcorpus": (None, "text"),
    "openwebtext": (None, "text"),
    "common_voice": ("en", "sentence"),
}


def load_and_preprocess_dataset():
    """Load every corpus listed in DATASETS, tokenize it, and concatenate.

    Each example is mapped to fixed-length ``input_ids`` (max_length=512,
    padded/truncated) using the module-level tokenizer.

    Returns:
        datasets.Dataset: the concatenation of all tokenized corpora.
    """
    dataset_list = []
    for dataset_name in DATASETS:
        config, text_field = _DATASET_SPECS.get(dataset_name, (None, "text"))
        dataset = load_dataset(dataset_name, config, split="train")
        # Bind text_field as a default arg to avoid the late-binding-closure trap.
        dataset = dataset.map(
            lambda x, field=text_field: {
                "input_ids": tokenizer(
                    x[field], truncation=True, padding="max_length", max_length=512
                )["input_ids"]
            }
        )
        dataset_list.append(dataset)
    return datasets.concatenate_datasets(dataset_list)
# Terms that must not appear verbatim in the training text.
_SENSITIVE_WORDS = ("password", "social security", "credit card", "classified")

# Compiled once at module level; IGNORECASE so "Password"/"CREDIT CARD" are
# caught too — the original str.replace() loop was case-sensitive and missed
# any capitalized variant.
_SENSITIVE_RE = re.compile(
    "|".join(re.escape(word) for word in _SENSITIVE_WORDS), re.IGNORECASE
)


def filter_sensitive_words(text):
    """Return *text* with sensitive terms replaced by "[REDACTED]".

    Matching is case-insensitive; the replacement marker is unchanged from
    the original implementation.
    """
    return _SENSITIVE_RE.sub("[REDACTED]", text)
# Build the tokenized training dataset (concatenation of all corpora).
dataset = load_and_preprocess_dataset()
# Redaction pass: decode each tokenized example back to text, scrub sensitive
# words, then re-tokenize to fixed-length input_ids (max_length=512).
# NOTE(review): decode() here keeps special/padding tokens in the decoded text
# before re-encoding — consider skip_special_tokens=True; confirm intent.
dataset = dataset.map(lambda x: {"input_ids": tokenizer(filter_sensitive_words(tokenizer.decode(x["input_ids"])), truncation=True, padding="max_length", max_length=512)["input_ids"]})
# Load the causal-LM weights to fine-tune.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Training hyperparameters and hub-push configuration.
training_args = TrainingArguments(
    output_dir="./deepfocus-llm",  # where checkpoints and the final model land
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective train batch = 4 * 4 per device
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,  # keep only the 2 most recent checkpoints
    fp16=True,  # mixed precision — assumes a CUDA GPU is available; confirm
    report_to="none",  # disable wandb/tensorboard reporters
    push_to_hub=True,  # upload checkpoints/model to the Hub
    hub_model_id="universeofml/DeepFocus-LLM-Privacy",
    hub_token=HF_TOKEN,
)
# The mapped dataset carries only "input_ids" — no "labels" — so a causal-LM
# Trainer had nothing to compute a loss against. DataCollatorForLanguageModeling
# with mlm=False builds labels from input_ids at batch time.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, data, tokenizer, collator and hyperparameters together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset.select(range(1000)),  # small fixed evaluation subset
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Callback wired to the Gradio button below.
def train_model():
    """Run fine-tuning, push the trained model to the Hub, and report status.

    Returns:
        str: a human-readable completion message for the status textbox.
    """
    trainer.train()
    trainer.push_to_hub()
    return "Training Completed and Model Uploaded to Hugging Face!"
# --- Gradio front-end: a single button that kicks off training ---
with gr.Blocks() as demo:
    gr.Markdown("# Train DeepFocus-LLM-Privacy")
    start_button = gr.Button("Start Training")
    status_box = gr.Textbox(label="Training Status")
    # Clicking the button runs train_model and writes its message to the box.
    start_button.click(train_model, outputs=status_box)

# Launch the web app only when executed as a script.
if __name__ == "__main__":
    demo.launch()