"""Gradio app that fine-tunes a causal language model on a selected QA dataset."""

import gradio as gr
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Display name -> identifier passed to `load_dataset`.
# NOTE(review): some of these identifiers (e.g. "nq", "faq", "obqa") may not
# resolve on the Hugging Face Hub, and some datasets may need a config name —
# verify each entry before relying on it.
datasets_info = {
    "SQuAD": "squad",
    "SQuAD 2.0": "squad_v2",
    "Natural Questions": "nq",
    "TriviaQA": "triviaqa",
    "QuAC": "quac",
    "FAQ Dataset": "faq",
    "BoolQ": "boolq",
    "Open Book QA": "obqa",
}

MODEL_ID = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"

# Columns that the tokenization step reads from every example.
REQUIRED_COLUMNS = ("question", "context")

# Load model and tokenizer once at start-up; every training run reuses (and
# keeps mutating) this single model instance.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Batches are padded by the data collator, which needs a pad token. Fall back
# to the EOS token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def train_model(dataset_name):
    """Fine-tune the shared model on one of the datasets in ``datasets_info``.

    Args:
        dataset_name: A key of ``datasets_info`` (the display name).

    Returns:
        A status message naming the dataset that was trained on.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``datasets_info``, or
            the loaded dataset has no ``train`` split.
        ValueError: If the train split lacks the ``question`` or ``context``
            column needed for tokenization.
    """
    dataset = load_dataset(datasets_info[dataset_name])

    # Fail early with a readable message instead of a KeyError raised from
    # inside `dataset.map`.
    available = dataset["train"].column_names
    missing = [name for name in REQUIRED_COLUMNS if name not in available]
    if missing:
        raise ValueError(
            f"Dataset {dataset_name!r} is missing required column(s) "
            f"{missing}; available columns: {available}"
        )

    def preprocess_function(examples):
        # Encode question and context together as a text pair.
        return tokenizer(
            examples["question"], examples["context"], truncation=True
        )

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Only evaluate when the dataset actually ships a validation split.
    has_validation = "validation" in tokenized_dataset

    training_args = TrainingArguments(
        output_dir=f"./{dataset_name}_model",
        # NOTE(review): newer transformers releases appear to rename this
        # argument to `eval_strategy` — confirm against the installed version.
        evaluation_strategy="epoch" if has_validation else "no",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # The tokenized examples carry no `labels`; with mlm=False the collator
    # pads each batch and derives causal-LM labels from `input_ids`, which
    # the model needs in order to return a loss.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"] if has_validation else None,
        data_collator=data_collator,
    )
    trainer.train()

    # Save the model weights and tokenizer side by side.
    model.save_pretrained(f"./{dataset_name}_model")
    tokenizer.save_pretrained(f"./{dataset_name}_model")

    return f"Model trained and saved for {dataset_name}!"


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Train QA Model on Multiple Datasets")

    dataset_name = gr.Dropdown(
        choices=list(datasets_info.keys()), label="Select Dataset"
    )
    train_button = gr.Button("Train Model")
    output = gr.Textbox(label="Output")

    def train_and_display(dataset_name):
        """Run training for the selected dataset and return a status string."""
        # The dropdown yields None until the user picks an entry.
        if not dataset_name:
            return "Please select a dataset before training."
        try:
            return train_model(dataset_name)
        except ValueError as err:
            # Show unsupported-schema problems in the output textbox.
            return f"Training failed: {err}"

    train_button.click(train_and_display, inputs=dataset_name, outputs=output)


if __name__ == "__main__":
    demo.launch()