Update README.md
README.md CHANGED

@@ -1,3 +1,78 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ datasets:
+ - fka/awesome-chatgpt-prompts
+ language:
+ - hi
+ - ta
+ - ml
+ ---
+ import datasets
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+ from datasets import Dataset
+
+ # Step 1: Define your colloquial dataset
+ # Sample conversational data in different languages (adjust based on your task)
+ data = {
+     'text': [
+         'kaise ho?',             # informal Hindi greeting ("how are you?")
+         'kya scene hai?',        # Hindi slang ("what's up?")
+         'apne kahan jana hai?',  # informal Hindi ("where do you have to go?")
+         'yentha vara',           # Tamil slang
+         'mizhhi pidichu',        # Malayalam slang
+         'enthu cheyyumo',        # Malayalam slang
+         'uru kuthi',             # Tamil slang
+         'ekdam mast',            # Hindi slang ("really great")
+     ],
+     'label': [0, 1, 2, 3, 4, 4, 3, 1]  # Example labels for intent or sentiment
+ }
+
+ # Step 2: Convert data into Hugging Face Dataset format
+ dataset = Dataset.from_dict(data)
+
+ # Step 3: Tokenize the data using a multilingual model tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
+
+ # Tokenization function
+ def tokenize_function(examples):
+     return tokenizer(examples['text'], padding="max_length", truncation=True)
+
+ # Apply tokenization to the dataset
+ dataset = dataset.map(tokenize_function, batched=True)
+
+ # Step 4: Load a pre-trained model for sequence classification
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=5)
+
+ # Step 5: Set up Trainer for fine-tuning the model
+ training_args = TrainingArguments(
+     output_dir='./results',          # Output directory to save model and logs
+     evaluation_strategy="epoch",     # Evaluate after each epoch
+     per_device_train_batch_size=8,   # Batch size during training
+     per_device_eval_batch_size=8,    # Batch size during evaluation
+     num_train_epochs=3,              # Number of epochs for training
+     logging_dir='./logs',            # Log directory for training details
+     logging_steps=10,                # Log training metrics every 10 steps
+ )
+
+ # Initialize the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset,
+     eval_dataset=dataset,  # In practice, split into separate training and validation sets (see the sketch after the diff)
+ )
+
+ # Step 6: Train the model
+ trainer.train()
+
+ # Step 7: Save the trained model and tokenizer
+ model.save_pretrained("./my_colloquial_model")
+ tokenizer.save_pretrained("./my_colloquial_model")
+
+ # Optional: Upload to Hugging Face
+ # Uncomment and use the Hugging Face CLI to upload the model:
+ # !huggingface-cli login  # Log in to your Hugging Face account
+ # model.push_to_hub("my_colloquial_model")
+
+ print("Model training and saving complete.")
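
The script above trains and evaluates on the same eight examples, which the inline comment at `eval_dataset` flags. A minimal sketch of a proper split, assuming the tokenized `dataset`, `model`, and `training_args` defined in the script above; the 75/25 split ratio and the accuracy helper are illustrative choices, not part of the original:

```python
import numpy as np
from transformers import Trainer

# Hold out a validation set; test_size=0.25 keeps 2 of the 8 examples
# for evaluation (an arbitrary choice for this toy dataset).
splits = dataset.train_test_split(test_size=0.25, seed=42)

# Simple accuracy metric; the original script reports only the eval loss.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    compute_metrics=compute_metrics,
)
```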
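
Once training finishes, the saved checkpoint can be loaded back for inference. A minimal sketch, assuming the `./my_colloquial_model` directory written in Step 7; the prediction is just an integer id in 0..4, since the script never defines names for its five classes:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned model and tokenizer saved in Step 7.
model = AutoModelForSequenceClassification.from_pretrained("./my_colloquial_model")
tokenizer = AutoTokenizer.from_pretrained("./my_colloquial_model")
model.eval()

# Classify a new colloquial phrase (informal Hindi: "how's it going?").
inputs = tokenizer("kya haal hai?", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_id = logits.argmax(dim=-1).item()
print(f"Predicted label id: {predicted_id}")
```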
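
The upload step exists only as comments in the script. If publishing, the tokenizer is usually pushed alongside the model so the hosted repository is usable on its own; a sketch, assuming `model` and `tokenizer` are still in scope and using the repository name from the commented-out line:

```python
from huggingface_hub import login

login()  # prompts for an access token; equivalent to `huggingface-cli login`

# Push both artifacts so the repo can be loaded later with from_pretrained.
model.push_to_hub("my_colloquial_model")
tokenizer.push_to_hub("my_colloquial_model")
```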