Pranathi2612
/

SY

Model card Files Files and versions

SY / README.md

Pranathi2612's picture

Update README.md

257c574 verified 12 months ago

|

history blame contribute delete

2.63 kB

	---
	license: apache-2.0
	datasets:
	- fka/awesome-chatgpt-prompts
	language:
	- hi
	- ta
	- ml
	---
	import datasets
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
	from datasets import Dataset

	# Step 1: Hand-written colloquial samples, kept as (utterance, label) pairs
	# so each phrase sits next to its class id. The labels are example ids for
	# intent or sentiment; adjust both columns to your task.
	_labelled_samples = [
	    ('kaise ho?', 0),             # informal Hindi greeting
	    ('kya scene hai?', 1),        # Hindi slang phrase
	    ('apne kahan jana hai?', 2),  # informal Hindi sentence
	    ('yentha vara', 3),           # Tamil slang
	    ('mizhhi pidichu', 4),        # Malayalam slang
	    ('enthu cheyyumo', 4),        # Malayalam slang
	    ('uru kuthi', 3),             # Tamil slang
	    ('ekdam mast', 1),            # Hindi slang
	]

	# Column-oriented layout: one list of texts and one parallel list of labels.
	data = {
	    'text': [utterance for utterance, _ in _labelled_samples],
	    'label': [class_id for _, class_id in _labelled_samples],
	}

	# Step 2: Wrap the column dict in a Hugging Face `Dataset`
	dataset = Dataset.from_dict(mapping=data)

	# Step 3: Load the tokenizer of the multilingual checkpoint used for tokenizing
	tokenizer_checkpoint = "bert-base-multilingual-cased"
	tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

	# Tokenization function
	def tokenize_function(examples):
	    """Tokenize the ``'text'`` entry of ``examples`` with the module-level ``tokenizer``.

	    Args:
	        examples: Mapping with a ``'text'`` entry. The script passes this
	            function to ``dataset.map(..., batched=True)``.

	    Returns:
	        Whatever ``tokenizer`` returns for those texts when called with
	        ``padding="max_length"`` and ``truncation=True``.
	    """
	    return tokenizer(examples['text'], padding="max_length", truncation=True)

	# Run the tokenization function over the dataset in batched mode
	dataset = dataset.map(function=tokenize_function, batched=True)

	# Step 4: Load the pre-trained multilingual checkpoint for sequence
	# classification, configured for 5 labels
	model = AutoModelForSequenceClassification.from_pretrained(
	    "bert-base-multilingual-cased",
	    num_labels=5,
	)

	# Step 5: Set up the training configuration for fine-tuning the model
	import inspect

	# Newer Transformers releases renamed `evaluation_strategy` to `eval_strategy`
	# and reject the old spelling, while older releases only know the old one.
	# Use whichever keyword the installed TrainingArguments accepts.
	_eval_strategy_key = (
	    "eval_strategy"
	    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
	    else "evaluation_strategy"
	)

	training_args = TrainingArguments(
	    output_dir='./results',           # Output directory to save model and logs
	    per_device_train_batch_size=8,    # Batch size during training
	    per_device_eval_batch_size=8,     # Batch size during evaluation
	    num_train_epochs=3,               # Number of epochs for training
	    logging_dir='./logs',             # Log directory for training details
	    logging_steps=10,                 # Number of steps between log entries
	    **{_eval_strategy_key: "epoch"},  # Evaluate after each epoch
	)

	# Collect the Trainer inputs first, then build the Trainer from them
	trainer_options = {
	    'model': model,
	    'args': training_args,
	    'train_dataset': dataset,
	    # The same dataset is reused for evaluation here; typically you would
	    # split it into training and validation sets.
	    'eval_dataset': dataset,
	}
	trainer = Trainer(**trainer_options)

	# Step 6: Run fine-tuning
	trainer.train()

	# Step 7: Save the trained model, then the tokenizer, into the same directory
	export_dir = "./my_colloquial_model"
	for artifact in (model, tokenizer):
	    artifact.save_pretrained(export_dir)

	# Optional: Upload to Hugging Face
	# Log in to your Hugging Face account first (`huggingface-cli login`),
	# then uncomment the line below:
	# model.push_to_hub("my_colloquial_model")

	print("Model training and saving complete.")