Spaces:
No application file
No application file
File size: 1,961 Bytes
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# Fetch the Roman Urdu dataset by its Hugging Face Hub identifier.
dataset = load_dataset("romanurdu_dataset")

# Multilingual BERT checkpoint; the tokenizer and the classifier must be
# loaded from the same checkpoint so that the vocabulary matches.
checkpoint = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
# Tokenize the dataset (adjust based on your dataset's structure)
def tokenize_function(examples):
    """Tokenize the ``'Text'`` field of ``examples`` with the module-level tokenizer.

    Every sequence is padded to the maximum length and longer inputs are
    truncated.

    Args:
        examples: Mapping that exposes the raw text under the ``'Text'`` key
            (presumably a batch supplied by ``dataset.map(..., batched=True)``;
            adjust the key to the dataset's actual column name).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    texts = examples['Text']
    return tokenizer(texts, padding="max_length", truncation=True)
# Tokenize the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Use the dataset's own test split when it ships one. Otherwise carve a
# held-out set out of the training data, so the script does not stop with a
# KeyError on datasets that only provide a 'train' split.
if 'test' in tokenized_datasets:
    train_dataset = tokenized_datasets['train']
    test_dataset = tokenized_datasets['test']
else:
    # A fixed seed keeps the 80/20 split reproducible between runs.
    split = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset = split['train']
    test_dataset = split['test']
# Define training arguments.
# Settings that are spelled the same way in every transformers release.
common_args = dict(
    output_dir='./results',          # output directory for model checkpoints
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=3,              # number of epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory to store logs
)

# The "evaluate after each epoch" option was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Try the new name first and fall back to the old one, so the script runs on
# either side of the rename.
try:
    training_args = TrainingArguments(eval_strategy="epoch", **common_args)
except TypeError:
    # Raised for the unknown keyword on releases that predate the rename.
    training_args = TrainingArguments(evaluation_strategy="epoch", **common_args)
# Wire the model, the training arguments and both splits into a Trainer,
# then run the fine-tuning loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

# Publish the model and its tokenizer to the same Hugging Face Hub repository
# (model first, then tokenizer).
hub_repo = "SentimentAnalysisRomanUrdu"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo)
|