"""Copy of Train.py

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1kmBG6E2hojULw9nZo3wPcAEzWDtB2axF
"""

# Install dependencies (Colab shell command).
!pip install transformers datasets torch huggingface_hub

import pandas as pd
from datasets import Dataset

# Quick look: load the raw CSV and wrap it as a Hugging Face Dataset.
df = pd.read_csv('Telugu.csv')
dataset = Dataset.from_pandas(df)

print(dataset)

file_path = "Telugu.csv"
df = pd.read_csv(file_path)

# Remove duplicate rows and rows with missing values.
df = df.drop_duplicates()
df = df.dropna()

print("Dataset after removing duplicates and null values:")
print(df.head())

# Save the cleaned corpus; the later cells load this file.
cleaned_file_name = "cleaned_telugu.csv"
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as {cleaned_file_name}")

from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub (required for push_to_hub below).
notebook_login()
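
# Outside a notebook, the programmatic login works as well (a sketch; the
# token string is a placeholder you would supply yourself):
# from huggingface_hub import login
# login(token="hf_...")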

from transformers import AutoTokenizer
from datasets import load_dataset

# Load the cleaned corpus saved above as a datasets object (one "train" split).
dataset = load_dataset('csv', data_files='cleaned_telugu.csv')

model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    # Tokenize English sources and Telugu targets to a fixed 128 tokens.
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

def tokenized_function(dataset):
    # Convenience wrapper: apply tokenize_fn across a dataset in batches.
    return dataset.map(tokenize_fn, batched=True)

tokenized_dataset = tokenized_function(dataset)
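
# An optional sketch (not in the original script): with padding="max_length",
# the label sequences contain pad token ids, which the cross-entropy loss will
# score unless they are replaced by -100, the index PyTorch ignores. A variant
# of tokenize_fn that masks padded label positions:
def tokenize_fn_masked(examples):
    model_inputs = tokenizer(list(examples['en']), max_length=128,
                             truncation=True, padding="max_length")
    labels = tokenizer(list(examples['te']), max_length=128,
                       truncation=True, padding="max_length").input_ids
    # Swap pad ids for -100 so padding does not contribute to the loss.
    model_inputs["labels"] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
        for seq in labels
    ]
    return model_inputs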

from transformers import AutoTokenizer
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Reload the cleaned corpus and build an 80/20 train/test split.
file_path = "cleaned_telugu.csv"
df = pd.read_csv(file_path)

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False keeps the pandas index out of the dataset columns.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
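
# An equivalent split using datasets' built-in helper (a sketch; the variable
# name alt_dataset is illustrative and unused below):
alt_dataset = Dataset.from_pandas(df, preserve_index=False).train_test_split(
    test_size=0.2, seed=42)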

# Re-create the tokenizer so this section runs on its own.
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    # Same tokenization as above: English inputs, Telugu labels, 128 tokens.
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = DatasetDict({
    'train': train_dataset.map(tokenize_fn, batched=True),
    'test': test_dataset.map(tokenize_fn, batched=True)
})
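
# An alternative worth noting (an assumption, not the original's approach):
# DataCollatorForSeq2Seq pads each batch dynamically and replaces label
# padding with -100 on the fly, avoiding the fixed max_length padding above.
# It would be passed to the Trainer below via data_collator=collator once the
# model is loaded.
from transformers import DataCollatorForSeq2Seq
# collator = DataCollatorForSeq2Seq(tokenizer, model=model)  # model loaded below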

from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the pretrained mBART weights to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training configuration; eval_strategy was named evaluation_strategy in
# older transformers releases.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_model_id="jaksani/Englishtranslator",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)
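
# Aside (an alternative, not what this script uses): transformers also
# provides Seq2SeqTrainer and Seq2SeqTrainingArguments, which add
# generation-based evaluation via predict_with_generate=True.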

# Run fine-tuning.
trainer.train()
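
# With push_to_hub=True, checkpoints are uploaded during training; an explicit
# final upload of the trained model can be done as well (a sketch):
trainer.push_to_hub()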

# Save the fine-tuned weights locally.
model.save_pretrained('./fine-tuned-model')
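
# A minimal inference sketch (assumptions: the 'en'/'te' columns hold English
# and Telugu text; the tokenizer is saved alongside the model so the directory
# loads on its own). Note that mBART generation usually wants
# forced_bos_token_id set to the target language id; Telugu's "te_IN" code
# exists in mBART-50 but may be missing from mbart-large-cc25's vocabulary,
# so it is left commented out here.
tokenizer.save_pretrained('./fine-tuned-model')

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ft_tokenizer = AutoTokenizer.from_pretrained('./fine-tuned-model')
ft_model = AutoModelForSeq2SeqLM.from_pretrained('./fine-tuned-model')

ft_tokenizer.src_lang = "en_XX"  # mBART source-language code for English
batch = ft_tokenizer("How are you?", return_tensors="pt")
generated = ft_model.generate(
    **batch,
    max_length=128,
    # forced_bos_token_id=ft_tokenizer.lang_code_to_id["te_IN"],
)
print(ft_tokenizer.batch_decode(generated, skip_special_tokens=True))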