# Provenance (from the Hugging Face Space page): katsukiai — "Update app.py",
# commit 473ce62 (verified).
# Standard library
import os
import re
import tqdm

# Third-party
import datasets
import gradio as gr
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
# Hugging Face authentication.
# The token is read from the environment (never hard-code credentials).  Fail
# early with an actionable message instead of a bare KeyError when it is
# missing, since everything below (dataset/model downloads, hub push) needs it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token.")
login(HF_TOKEN)
# Model and dataset
# Base checkpoint to fine-tune; also the Hub repo the result is pushed to.
MODEL_NAME = "universeofml/DeepFocus-LLM-Privacy"
# Hub dataset names used for training.
# NOTE(review): several of these (wikitext, cnn_dailymail, common_voice)
# require an explicit config name, and some have no "text" column -- confirm
# each one actually loads with the preprocessing below.
DATASETS = [
    "wikitext", "cnn_dailymail", "squad", "bookcorpus", "openwebtext", "common_voice"
]
# Load tokenizer
# Downloaded from the Hub at import time (network side effect).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Per-dataset Hub config names: these corpora cannot be loaded without an
# explicit config (load_dataset raises a ValueError otherwise).
_DATASET_CONFIGS = {
    "wikitext": "wikitext-103-raw-v1",
    "cnn_dailymail": "3.0.0",
    "common_voice": "en",
}
# Column holding the raw text for corpora that do not expose a "text" column.
_TEXT_COLUMNS = {
    "cnn_dailymail": "article",
    "squad": "context",
    "common_voice": "sentence",
}

# Function to load and preprocess dataset
def load_and_preprocess_dataset():
    """Load every corpus in DATASETS, tokenize it, and concatenate.

    Each example is tokenized to a fixed-length (512 tokens, padded/truncated)
    ``input_ids`` sequence.  All original columns are dropped so the
    per-dataset schemas match -- ``datasets.concatenate_datasets`` requires
    identical features across its inputs.

    Returns:
        datasets.Dataset: the concatenated, tokenized training corpus.
    """
    tokenized_parts = []
    for name in DATASETS:
        config = _DATASET_CONFIGS.get(name)
        if config is not None:
            ds = load_dataset(name, config, split="train")
        else:
            ds = load_dataset(name, split="train")
        text_column = _TEXT_COLUMNS.get(name, "text")
        # Bind text_column as a default argument so the lambda captures the
        # current value rather than the last loop iteration's.
        ds = ds.map(
            lambda x, col=text_column: {
                "input_ids": tokenizer(x[col], truncation=True, padding="max_length", max_length=512)["input_ids"]
            },
            remove_columns=ds.column_names,
        )
        tokenized_parts.append(ds)
    return datasets.concatenate_datasets(tokenized_parts)
# Function to check for sensitive words
# Words/phrases that must never reach the training corpus.
SENSITIVE_WORDS = ["password", "social security", "credit card", "classified"]
# Pre-compiled, case-insensitive pattern.  The original str.replace() loop was
# case-sensitive, so "Password" / "CREDIT CARD" slipped through -- a real gap
# for a privacy filter.  Compiling once also avoids rebuilding per call.
_SENSITIVE_RE = re.compile(
    "|".join(re.escape(word) for word in SENSITIVE_WORDS),
    re.IGNORECASE,
)

def filter_sensitive_words(text):
    """Return *text* with every sensitive word replaced by "[REDACTED]".

    Matching is case-insensitive; the set of redacted terms is
    SENSITIVE_WORDS.
    """
    return _SENSITIVE_RE.sub("[REDACTED]", text)
# Load dataset
# Builds the concatenated, tokenized training corpus (network side effect;
# downloads every corpus in DATASETS at import time).
dataset = load_and_preprocess_dataset()
# Apply filtering
# Redacts sensitive words from every example.  NOTE(review): this decodes the
# already-tokenized ids back to text and re-tokenizes the filtered result,
# which is slow and lossy -- consider filtering the raw text before the first
# tokenization pass instead.
dataset = dataset.map(lambda x: {"input_ids": tokenizer(filter_sensitive_words(tokenizer.decode(x["input_ids"])), truncation=True, padding="max_length", max_length=512)["input_ids"]})
# Load model
# Causal-LM checkpoint matching the tokenizer; downloaded from the Hub.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Training arguments
training_args = TrainingArguments(
    output_dir="./deepfocus-llm",  # local checkpoint directory
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective train batch of 16 per device
    # NOTE(review): this kwarg was renamed to `eval_strategy` in newer
    # transformers releases -- confirm against the pinned version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=True,  # mixed precision; assumes a CUDA device -- TODO confirm
    report_to="none",  # disable wandb/tensorboard reporters
    push_to_hub=True,
    hub_model_id="universeofml/DeepFocus-LLM-Privacy",
    hub_token=HF_TOKEN,
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # NOTE(review): the eval set is the first 1000 *training* examples, so
    # eval loss measures memorization, not generalization -- consider a
    # held-out split.
    eval_dataset=dataset.select(range(1000)),  # Small evaluation subset
    tokenizer=tokenizer,
)
# Gradio UI
def train_model():
    """Run fine-tuning, push the result to the Hub, and return a status string.

    Blocks until trainer.train() and trainer.push_to_hub() finish, so the UI
    button stays busy for the whole training run.
    """
    trainer.train()
    trainer.push_to_hub()
    return "Training Completed and Model Uploaded to Hugging Face!"
# Gradio interface
# Single-button UI: clicking "Start Training" runs the (blocking) training
# job and displays the returned status string in the textbox.
with gr.Blocks() as demo:
    gr.Markdown("# Train DeepFocus-LLM-Privacy")
    btn = gr.Button("Start Training")
    output = gr.Textbox(label="Training Status")
    btn.click(train_model, outputs=output)
# Run Gradio app
# Launch the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()