Spaces:
No application file
No application file
# Fine-tune multilingual BERT (mBERT) as a two-label sequence classifier on
# the "romanurdu_dataset" dataset, then publish the model and tokenizer to the
# Hugging Face Hub under "SentimentAnalysisRomanUrdu".

# Import necessary libraries
import dataclasses

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load your dataset (assuming you uploaded it to Hugging Face)
dataset = load_dataset("romanurdu_dataset")

# Load pre-trained mBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2,
)


# Tokenize the dataset (adjust based on your dataset's structure)
def tokenize_function(examples):
    """Tokenize the ``'Text'`` column of a batch of examples.

    Args:
        examples: Batch passed in by ``dataset.map(..., batched=True)``;
            it must provide a ``'Text'`` entry.

    Returns:
        The tokenizer's output for the batch, padded with
        ``padding="max_length"`` and truncated.
    """
    return tokenizer(examples['Text'], padding="max_length", truncation=True)


# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into train and test datasets (if not already split)
# NOTE(review): this indexing assumes the dataset already ships 'train' and
# 'test' splits, and presumably a label column the Trainer can use for the
# loss -- confirm against the dataset's actual schema.
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

# Newer transformers releases renamed the ``evaluation_strategy`` argument to
# ``eval_strategy``. Use whichever name the installed TrainingArguments
# declares so the script works on both sides of the rename.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model checkpoints
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=3,              # number of epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory to store logs
    **{_eval_strategy_key: "epoch"},  # evaluate after each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,                  # the model to be trained
    args=training_args,           # training arguments
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,    # evaluation dataset
)

# Train the model
trainer.train()

# Save the model to Hugging Face Model Hub
# NOTE(review): pushing presumably requires Hub credentials to be configured
# in the environment -- confirm before running unattended.
model.push_to_hub("SentimentAnalysisRomanUrdu")
tokenizer.push_to_hub("SentimentAnalysisRomanUrdu")