---
license: apache-2.0
datasets:
- fka/awesome-chatgpt-prompts
language:
- hi
- ta
- ml
---

import inspect

import datasets
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Step 1: Define your colloquial dataset
# Sample conversational data in different languages (adjust based on your task).
# 'text' and 'label' are parallel lists: label[i] is the class id of text[i].
data = {
    'text': [
        'kaise ho?',             # informal Hindi greeting
        'kya scene hai?',        # Hindi slang phrase
        'apne kahan jana hai?',  # informal Hindi sentence
        'yentha vara',           # Tamil slang
        'mizhhi pidichu',        # Malayalam slang
        'enthu cheyyumo',        # Malayalam slang
        'uru kuthi',             # Tamil slang
        'ekdam mast',            # Hindi slang
    ],
    # Example labels for intent or sentiment; the ids used here span 0-4.
    'label': [0, 1, 2, 3, 4, 4, 3, 1],
}

# Step 2: Convert data into Hugging Face Dataset format
dataset = Dataset.from_dict(data)

# Step 3: Tokenize the data using a multilingual model tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that holds the raw input strings under ``'text'``
            (called with batches by ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that batch of strings.
    """
    # padding="max_length" pads every example to one fixed length and
    # truncation=True cuts over-long inputs, so all rows end up the same size.
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# Apply tokenization to the dataset
dataset = dataset.map(tokenize_function, batched=True)

# Step 4: Load a pre-trained model for sequence classification
# num_labels=5 sizes the classification head for five classes.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=5)

# Step 5: Set up Trainer for fine-tuning the model
# Newer transformers releases renamed the `evaluation_strategy` keyword to
# `eval_strategy`. Use whichever name the installed version accepts so the
# script runs on both older and newer releases.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',          # Output directory to save model and logs
    per_device_train_batch_size=8,   # Batch size during training
    per_device_eval_batch_size=8,    # Batch size during evaluation
    num_train_epochs=3,              # Number of epochs for training
    logging_dir='./logs',            # Log directory for training details
    logging_steps=10,                # Number of steps to log
    **{_eval_strategy_key: "epoch"},  # Evaluate after each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # NOTE: evaluation runs on the same rows used for training, so the metrics
    # only show that the pipeline works. Typically, split the dataset into
    # training and validation sets.
    eval_dataset=dataset,
)

# Step 6: Train the model
trainer.train()

# Step 7: Save the trained model and tokenizer
# Both go to the same directory so it can be reloaded as one checkpoint.
model.save_pretrained("./my_colloquial_model")
tokenizer.save_pretrained("./my_colloquial_model")

# Optional: Upload to Hugging Face
# Uncomment and use Hugging Face CLI to upload the model:
# !huggingface-cli login  # Log in to your Hugging Face account
# model.push_to_hub("my_colloquial_model")

print("Model training and saving complete.")