Upload copy_of_train_py.py
copy_of_train_py.py ADDED (+166 -0)
@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
"""Copy of Train.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1kmBG6E2hojULw9nZo3wPcAEzWDtB2axF
"""

!pip install transformers datasets torch huggingface_hub

import pandas as pd
from datasets import Dataset

# Load the dataset (replace 'Telugu.csv' with your own file name)
df = pd.read_csv('Telugu.csv')
dataset = Dataset.from_pandas(df)

# Display the dataset
print(dataset)

# Load the dataset for cleaning
file_path = "Telugu.csv"  # Use the file name of your uploaded dataset
df = pd.read_csv(file_path)

# Remove duplicates
df = df.drop_duplicates()

# Remove rows with missing values
df = df.dropna()

# Preview the cleaned dataset
print("Dataset after removing duplicates and null values:")
print(df.head())

# Save the cleaned dataset
cleaned_file_name = "cleaned_telugu.csv"
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as {cleaned_file_name}")
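
# Note (added): the tokenization below assumes the CSV has 'en' (English) and
# 'te' (Telugu) columns; a quick sanity check before going further:
expected_columns = {'en', 'te'}
missing = expected_columns - set(df.columns)
if missing:
    raise ValueError(f"Dataset is missing expected columns: {missing}")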

from huggingface_hub import notebook_login

notebook_login()
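
# Note (added): outside a notebook, huggingface_hub's login() function (or the
# `huggingface-cli login` command) works instead; the token below is a
# placeholder, not a real value:
# from huggingface_hub import login
# login(token="hf_...")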

from transformers import AutoTokenizer
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('csv', data_files='Telugu.csv')

# Create train and test splits (if needed)
# ... (Your existing code for splitting) ...

# Load the tokenizer
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing/tokenization function
def tokenize_fn(examples):
    inputs = [ex for ex in examples['en']]
    targets = [ex for ex in examples['te']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
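
# Note (added): two common refinements, sketched under the assumption of a
# recent transformers version. mBART tokenizers are normally told the source
# and target language codes ('en_XX' is English; verify that your checkpoint
# supports a Telugu code — 'te_IN' comes from the mBART-50 checkpoints and may
# not exist in mbart-large-cc25). Padded label positions are also usually set
# to -100 so the loss ignores them. Defined under a separate name; not called.
def tokenize_fn_masked(examples):
    tokenizer.src_lang = "en_XX"   # source language code (assumption: English source)
    tokenizer.tgt_lang = "te_IN"   # target language code (verify against the checkpoint)
    model_inputs = tokenizer(
        examples['en'],
        text_target=examples['te'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Replace padding token ids in the labels with -100 so the loss ignores them
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in model_inputs["labels"]
    ]
    return model_inputs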

from sklearn.model_selection import train_test_split
from datasets import DatasetDict

# Reload the cleaned dataset saved above so training uses the cleaned data
df = pd.read_csv(cleaned_file_name)

# Create train and test splits
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

# Convert the split DataFrames back to Dataset objects
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data.reset_index(drop=True))

# Collect the splits in a DatasetDict
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
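
# Note (added): datasets can produce the same split without sklearn; an
# equivalent one-liner, kept under a separate name so it does not clobber
# the DatasetDict built above:
dataset_alt = load_dataset('csv', data_files=cleaned_file_name)['train'].train_test_split(test_size=0.2, seed=42)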

# Apply tokenization to the train and test splits separately
tokenized_dataset = DatasetDict({
    'train': train_dataset.map(tokenize_fn, batched=True),
    'test': test_dataset.map(tokenize_fn, batched=True),
})

from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the pre-trained model for sequence-to-sequence tasks
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',                    # Output directory
    num_train_epochs=3,                        # Number of training epochs
    per_device_train_batch_size=16,            # Batch size for training
    per_device_eval_batch_size=16,             # Batch size for evaluation
    warmup_steps=500,                          # Warmup steps for the learning rate scheduler
    weight_decay=0.01,                         # Strength of weight decay
    logging_dir='./logs',                      # Directory for storing logs
    eval_strategy="epoch",                     # Evaluate at the end of each epoch ('evaluation_strategy' in older transformers releases)
    save_strategy="epoch",                     # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,               # Reload the best checkpoint at the end of training
    metric_for_best_model="eval_loss",         # Use evaluation loss to pick the best checkpoint
    push_to_hub=True,                          # Push checkpoints to the Hugging Face Hub
    hub_model_id="jaksani/Englishtranslator",  # Hub repository id
)
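
# Note (added): the examples above are padded to max_length, so the default
# collator works; DataCollatorForSeq2Seq with dynamic padding is a common
# alternative for seq2seq fine-tuning. A minimal sketch, not wired into the
# Trainer below:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)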

trainer = Trainer(
    model=model,                               # The initialized model
    args=training_args,                        # Training arguments
    train_dataset=tokenized_dataset['train'],  # Training dataset
    eval_dataset=tokenized_dataset['test'],    # Evaluation dataset
)

trainer.train()

model.save_pretrained('./fine-tuned-model')
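
# Note (added): saving the tokenizer alongside the model makes the folder
# self-contained for later loading:
tokenizer.save_pretrained('./fine-tuned-model')

# A quick inference sketch (assumes the English -> Telugu direction trained above):
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ft_tokenizer = AutoTokenizer.from_pretrained('./fine-tuned-model')
ft_model = AutoModelForSeq2SeqLM.from_pretrained('./fine-tuned-model')

sample = ft_tokenizer("How are you?", return_tensors="pt")
output_ids = ft_model.generate(**sample, max_length=128)
print(ft_tokenizer.batch_decode(output_ids, skip_special_tokens=True))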