# Englishtranslater / copy_of_train_py.py
# Uploaded by jaksani (commit f89a731, verified)
# -*- coding: utf-8 -*-
"""Copy of Train.py
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1kmBG6E2hojULw9nZo3wPcAEzWDtB2axF
"""
# Dependency installation is an IPython magic command; in a plain .py file it
# is a syntax error, so it is kept only as a reference comment. Run it in a
# notebook cell (or `pip install ...` in a shell) before executing this script.
# !pip install transformers datasets torch huggingface_hub
import pandas as pd
from datasets import Dataset

# Load the raw parallel corpus (expected to contain 'en' and 'te' columns —
# see the tokenization functions below).
df = pd.read_csv('Telugu.csv')  # Replace 'Telugu.csv' with your file name
dataset = Dataset.from_pandas(df)

# Quick sanity check of what was loaded.
print(dataset)
import pandas as pd

# Re-read the raw corpus for the cleaning pass.
file_path = "Telugu.csv"  # Use the file name of your uploaded dataset

# Drop exact duplicate rows, then any row with a missing value, in one chain.
df = pd.read_csv(file_path).drop_duplicates().dropna()

# Preview the result of the cleaning pass.
print("Dataset after removing duplicates and null values:")
print(df.head())

# Persist the cleaned corpus so later steps can consume it.
cleaned_file_name = "cleaned_telugu.csv"
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as {cleaned_file_name}")
from huggingface_hub import notebook_login
# Interactively authenticate with the Hugging Face Hub (prompts for an access
# token in the notebook UI). Required later because TrainingArguments sets
# push_to_hub=True.
notebook_login()
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the CLEANED dataset produced by the cleaning step above. The original
# code re-read the raw 'Telugu.csv', silently discarding the de-duplication /
# null-removal work.
dataset = load_dataset('csv', data_files='cleaned_telugu.csv')

# Create train and test splits (if needed)
# ... (Your existing code for splitting) ...

# Load the mBART tokenizer matching the model fine-tuned below.
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Preprocessing function
def tokenized_function(examples):
    """Tokenize one batch of English ('en') / Telugu ('te') pairs.

    Returns a dict of model inputs with 'labels' set to the tokenized targets.

    NOTE(review): the original body called an undefined `preprocess_function`
    and tried to map the whole dataset from inside a per-batch callback — it
    would raise NameError if ever invoked. It now tokenizes directly, matching
    `tokenize_fn` below. Relies on the module-level `tokenizer`.
    """
    model_inputs = tokenizer(
        list(examples['en']), max_length=128, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = tokenizer(
        list(examples['te']), max_length=128, truncation=True, padding="max_length"
    ).input_ids
    return model_inputs
def tokenize_fn(examples):
    """Tokenize a batch of parallel sentences for seq2seq fine-tuning.

    English sentences ('en') become encoder inputs; Telugu sentences ('te')
    become decoder labels. Both sides are truncated/padded to 128 tokens.
    Relies on the module-level `tokenizer`.
    """
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target= encodes the targets as labels (correct special-token /
    # language handling for mBART) instead of as source text.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True,
                       padding="max_length").input_ids
    # Replace pad ids with -100 so padded label positions are ignored by the
    # cross-entropy loss (the original trained on padding tokens).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
from transformers import AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import pandas as pd

# Load the cleaned corpus produced earlier (the original re-read the raw CSV,
# discarding the cleaning step, and also called load_dataset() only to throw
# the result away two lines later).
file_path = "cleaned_telugu.csv"
df = pd.read_csv(file_path)

# Reproducible 80/20 train/test split.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# train_data / test_data are already DataFrames — no pd.DataFrame() re-wrap
# needed. preserve_index=False stops the pandas index from leaking into the
# dataset as an '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)

# Bundle the splits so downstream code can address dataset['train'/'test'].
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# ... (Rest of your code remains the same)
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_fn(examples):
    """Tokenize a batch: English ('en') -> encoder inputs, Telugu ('te') -> labels.

    Pads/truncates both sides to 128 tokens; pad positions in the labels are
    set to -100 so they are ignored by the loss (the original version trained
    on padding tokens).
    """
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target= gives mBART-correct label tokenization for the targets.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True,
                       padding="max_length").input_ids
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels
    ]
    return model_inputs

# Apply tokenization to train and test datasets separately
tokenized_dataset = DatasetDict({
    'train': train_dataset.map(tokenize_fn, batched=True),
    'test': test_dataset.map(tokenize_fn, batched=True)
})
# ... (Rest of your code remains the same)
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the pre-trained mBART checkpoint for seq2seq (translation) fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): this TrainingArguments object is immediately overwritten by
# the next cell (which adds the push_to_hub settings); kept for parity with
# the original notebook flow.
training_args = TrainingArguments(
    output_dir='./results',              # Where checkpoints/outputs are written
    num_train_epochs=3,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=500,                    # Linear LR warmup steps
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir='./logs',                # Directory for storing logs
    eval_strategy="epoch",               # 'evaluation_strategy' was renamed in transformers>=4.41
    save_strategy="epoch",               # Must match eval strategy for load_best_model_at_end
    load_best_model_at_end=True,         # Restore the best checkpoint after training
    metric_for_best_model="eval_loss",   # Select best checkpoint by eval loss
)
# Final training configuration — this is the one the Trainer actually uses.
# Built as a plain dict first, then unpacked into TrainingArguments.
_training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",               # evaluate at the end of each epoch
    save_strategy="epoch",               # checkpoint at the end of each epoch
    load_best_model_at_end=True,         # reload the best checkpoint when done
    metric_for_best_model="eval_loss",   # "best" == lowest evaluation loss
    push_to_hub=True,                    # upload checkpoints to the Hub
    hub_model_id="jaksani/Englishtranslator",
)
training_args = TrainingArguments(**_training_config)
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-cc25"
# NOTE(review): `model` was already instantiated from this same checkpoint
# above; re-running from_pretrained here loaded the multi-GB mBART weights a
# second time for no benefit, so the existing instance is reused.

trainer = Trainer(
    model=model,                               # The initialized mBART model
    args=training_args,                        # Training configuration
    train_dataset=tokenized_dataset['train'],  # Tokenized training split
    eval_dataset=tokenized_dataset['test'],    # Tokenized evaluation split
)

# Fine-tune, then save the resulting weights locally.
trainer.train()
model.save_pretrained('./fine-tuned-model')