# Englishtranslater / copy_of_train_py.py
# Uploaded by jaksani (commit f89a731, verified)
# -*- coding: utf-8 -*-
"""Copy of Train.py
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1kmBG6E2hojULw9nZo3wPcAEzWDtB2axF
"""
# Dependency installation is an IPython magic command; in a plain .py file it
# is a syntax error, so it is kept only as a reference comment. Run it in a
# notebook cell (or `pip install ...` in a shell) before executing this script.
# !pip install transformers datasets torch huggingface_hub
import pandas as pd
from datasets import Dataset

# Load the raw parallel corpus (expected to contain 'en' and 'te' columns —
# see the tokenization functions below).
df = pd.read_csv('Telugu.csv')  # Replace 'Telugu.csv' with your file name
dataset = Dataset.from_pandas(df)

# Quick sanity check of what was loaded.
print(dataset)
import pandas as pd

# Re-read the raw corpus for the cleaning pass.
file_path = "Telugu.csv"  # Use the file name of your uploaded dataset

# Drop exact duplicate rows, then any row with a missing value, in one chain.
df = pd.read_csv(file_path).drop_duplicates().dropna()

# Preview the result of the cleaning pass.
print("Dataset after removing duplicates and null values:")
print(df.head())

# Persist the cleaned corpus so later steps can consume it.
cleaned_file_name = "cleaned_telugu.csv"
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as {cleaned_file_name}")
from huggingface_hub import notebook_login
# Interactively authenticate with the Hugging Face Hub (prompts for an access
# token in the notebook UI). Required later because TrainingArguments sets
# push_to_hub=True.
notebook_login()
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the CLEANED dataset produced by the cleaning step above. The original
# code re-read the raw 'Telugu.csv', silently discarding the de-duplication /
# null-removal work.
dataset = load_dataset('csv', data_files='cleaned_telugu.csv')

# Create train and test splits (if needed)
# ... (Your existing code for splitting) ...

# Load the mBART tokenizer matching the model fine-tuned below.
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Preprocessing function
def tokenized_function(examples):
    """Tokenize one batch of English ('en') / Telugu ('te') pairs.

    Returns a dict of model inputs with 'labels' set to the tokenized targets.

    NOTE(review): the original body called an undefined `preprocess_function`
    and tried to map the whole dataset from inside a per-batch callback — it
    would raise NameError if ever invoked. It now tokenizes directly, matching
    `tokenize_fn` below. Relies on the module-level `tokenizer`.
    """
    model_inputs = tokenizer(
        list(examples['en']), max_length=128, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = tokenizer(
        list(examples['te']), max_length=128, truncation=True, padding="max_length"
    ).input_ids
    return model_inputs
def tokenize_fn(examples):
    """Tokenize a batch of parallel sentences for seq2seq fine-tuning.

    English sentences ('en') become encoder inputs; Telugu sentences ('te')
    become decoder labels. Both sides are truncated/padded to 128 tokens.
    Relies on the module-level `tokenizer`.
    """
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target= encodes the targets as labels (correct special-token /
    # language handling for mBART) instead of as source text.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True,
                       padding="max_length").input_ids
    # Replace pad ids with -100 so padded label positions are ignored by the
    # cross-entropy loss (the original trained on padding tokens).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
from transformers import AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import pandas as pd

# Load the cleaned corpus produced earlier (the original re-read the raw CSV,
# discarding the cleaning step, and also called load_dataset() only to throw
# the result away two lines later).
file_path = "cleaned_telugu.csv"
df = pd.read_csv(file_path)

# Reproducible 80/20 train/test split.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# train_data / test_data are already DataFrames — no pd.DataFrame() re-wrap
# needed. preserve_index=False stops the pandas index from leaking into the
# dataset as an '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)

# Bundle the splits so downstream code can address dataset['train'/'test'].
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# ... (Rest of your code remains the same)
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_fn(examples):
    """Tokenize a batch: English ('en') -> encoder inputs, Telugu ('te') -> labels.

    Pads/truncates both sides to 128 tokens; pad positions in the labels are
    set to -100 so they are ignored by the loss (the original version trained
    on padding tokens).
    """
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target= gives mBART-correct label tokenization for the targets.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True,
                       padding="max_length").input_ids
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels
    ]
    return model_inputs

# Apply tokenization to train and test datasets separately
tokenized_dataset = DatasetDict({
    'train': train_dataset.map(tokenize_fn, batched=True),
    'test': test_dataset.map(tokenize_fn, batched=True)
})
# ... (Rest of your code remains the same)
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the pre-trained mBART checkpoint for seq2seq (translation) fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): this TrainingArguments object is immediately overwritten by
# the next cell (which adds the push_to_hub settings); kept for parity with
# the original notebook flow.
training_args = TrainingArguments(
    output_dir='./results',              # Where checkpoints/outputs are written
    num_train_epochs=3,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=500,                    # Linear LR warmup steps
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir='./logs',                # Directory for storing logs
    eval_strategy="epoch",               # 'evaluation_strategy' was renamed in transformers>=4.41
    save_strategy="epoch",               # Must match eval strategy for load_best_model_at_end
    load_best_model_at_end=True,         # Restore the best checkpoint after training
    metric_for_best_model="eval_loss",   # Select best checkpoint by eval loss
)
# Final training configuration — this is the one the Trainer actually uses.
# Built as a plain dict first, then unpacked into TrainingArguments.
_training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",               # evaluate at the end of each epoch
    save_strategy="epoch",               # checkpoint at the end of each epoch
    load_best_model_at_end=True,         # reload the best checkpoint when done
    metric_for_best_model="eval_loss",   # "best" == lowest evaluation loss
    push_to_hub=True,                    # upload checkpoints to the Hub
    hub_model_id="jaksani/Englishtranslator",
)
training_args = TrainingArguments(**_training_config)
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-cc25"
# NOTE(review): `model` was already instantiated from this same checkpoint
# above; re-running from_pretrained here loaded the multi-GB mBART weights a
# second time for no benefit, so the existing instance is reused.

trainer = Trainer(
    model=model,                               # The initialized mBART model
    args=training_args,                        # Training configuration
    train_dataset=tokenized_dataset['train'],  # Tokenized training split
    eval_dataset=tokenized_dataset['test'],    # Tokenized evaluation split
)

# Fine-tune, then save the resulting weights locally.
trainer.train()
model.save_pretrained('./fine-tuned-model')