"""Fine-tune multilingual BERT as a two-class sequence classifier on Roman Urdu text.

The script loads the ``romanurdu_dataset`` dataset, tokenizes its ``Text``
column, fine-tunes ``bert-base-multilingual-cased`` with the Hugging Face
``Trainer`` for three epochs, and uploads the resulting model and tokenizer
to the ``SentimentAnalysisRomanUrdu`` repository on the Hugging Face Hub.

Run it as a script; importing the module has no side effects.
"""

import dataclasses

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Dataset identifier passed to ``load_dataset`` (assumed to be uploaded to
# the Hugging Face Hub and to provide 'train' and 'test' splits).
DATASET_NAME = "romanurdu_dataset"
# Pre-trained mBERT checkpoint shared by the tokenizer and the model.
BASE_CHECKPOINT = "bert-base-multilingual-cased"
# Hub repository that receives the fine-tuned model and its tokenizer.
HUB_REPO_NAME = "SentimentAnalysisRomanUrdu"
# Name of the dataset column holding the raw text; adjust to your dataset.
TEXT_COLUMN = "Text"
# Size of the classification head.
NUM_LABELS = 2


def _eval_strategy_kwarg(strategy):
    """Return the evaluation-strategy keyword the installed transformers accepts.

    Newer ``transformers`` releases name the ``TrainingArguments`` field
    ``eval_strategy``; older ones name it ``evaluation_strategy``. Looking the
    field up on the dataclass keeps the script working on both.

    Args:
        strategy: The strategy value to pass through (e.g. ``"epoch"``).

    Returns:
        A single-item dict suitable for ``**``-expansion into
        ``TrainingArguments``.
    """
    field_names = {field.name for field in dataclasses.fields(TrainingArguments)}
    if "eval_strategy" in field_names:
        return {"eval_strategy": strategy}
    return {"evaluation_strategy": strategy}


def main():
    """Load the data, fine-tune the classifier, and push the result to the Hub."""
    dataset = load_dataset(DATASET_NAME)

    tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_CHECKPOINT,
        num_labels=NUM_LABELS,
    )

    def tokenize_function(examples):
        """Tokenize one batch of examples taken from the text column."""
        # No explicit max_length is given, so padding/truncation fall back to
        # the tokenizer's own maximum input length.
        return tokenizer(
            examples[TEXT_COLUMN],
            padding="max_length",
            truncation=True,
        )

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # NOTE(review): Trainer needs a label column in these splits to compute a
    # loss -- presumably the dataset already exposes one under a name the
    # default collator recognises; confirm against the dataset schema.
    train_dataset = tokenized_datasets["train"]
    test_dataset = tokenized_datasets["test"]

    training_args = TrainingArguments(
        output_dir="./results",  # where model checkpoints are written
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",  # where training logs are written
        **_eval_strategy_kwarg("epoch"),  # evaluate after each epoch
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()

    # Publish the fine-tuned weights together with the matching tokenizer.
    model.push_to_hub(HUB_REPO_NAME)
    tokenizer.push_to_hub(HUB_REPO_NAME)


if __name__ == "__main__":
    main()