jaksani committed on
Commit
f89a731
·
verified ·
1 Parent(s): 1434318

Upload copy_of_train_py.py

Browse files
Files changed (1) hide show
  1. copy_of_train_py.py +166 -0
copy_of_train_py.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Copy of Train.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1kmBG6E2hojULw9nZo3wPcAEzWDtB2axF
"""

# NOTE(review): the original line here was `!pip install transformers datasets
# torch huggingface_hub`.  The `!` shell-magic is IPython/Colab-only syntax and
# is a SyntaxError in a plain .py file, so it is kept as a comment.  Install
# the dependencies from a shell instead:
#   pip install transformers datasets torch huggingface_hub
11
+
12
import pandas as pd
from datasets import Dataset

# Read the raw parallel corpus and wrap it in a Hugging Face Dataset.
df = pd.read_csv('Telugu.csv')
dataset = Dataset.from_pandas(df)

# Print a summary (row count / column names) of what was loaded.
print(dataset)
21
+
22
import pandas as pd

# Re-read the raw CSV and scrub it: drop exact duplicate rows first, then
# every row that still contains a missing value.
file_path = "Telugu.csv"
df = pd.read_csv(file_path).drop_duplicates().dropna()

# Show a preview so the effect of the scrub is visible in the log.
print("Dataset after removing duplicates and null values:")
print(df.head())

# Persist the scrubbed corpus so later runs can start from it.
cleaned_file_name = "cleaned_telugu.csv"
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as {cleaned_file_name}")
42
+
43
# Interactive Hugging Face authentication (prompts for a token in a notebook).
# Required later because TrainingArguments below sets push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()
46
+
47
from transformers import AutoTokenizer
from datasets import load_dataset

# Reload the corpus through the `datasets` CSV loader (yields a single
# 'train' split).  NOTE(review): this reads the raw Telugu.csv, not the
# cleaned_telugu.csv written above — confirm which file is intended.
dataset = load_dataset('csv', data_files='Telugu.csv')

# Tokenizer matching the mBART checkpoint fine-tuned later in the script.
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)
59
+
60
def tokenized_function(examples):
    """Deprecated alias kept for backward compatibility; delegates to tokenize_fn.

    The original body referenced an undefined name ``preprocess_function``
    (a guaranteed NameError if called) and ignored its ``examples`` argument,
    re-mapping the whole module-level dataset instead.
    """
    return tokenize_fn(examples)


def tokenize_fn(examples):
    """Tokenize a batch of English→Telugu pairs for seq2seq training.

    Args:
        examples: batch dict with 'en' (source) and 'te' (target) string columns.

    Returns:
        Model features (input ids / attention mask) for the English side, with
        a 'labels' key holding the tokenized Telugu side; both padded and
        truncated to 128 tokens.
    """
    # list() replaces the original identity comprehensions [ex for ex in ...].
    inputs = list(examples['en'])
    targets = list(examples['te'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs


# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)
74
+
75
from transformers import AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import pandas as pd

# Load the corpus and carve out a reproducible 80/20 train/test split.
# NOTE(review): this reads the raw CSV; the cleaned_telugu.csv produced by
# the cleaning step above is never used — confirm which file is intended.
file_path = "Telugu.csv"
df = pd.read_csv(file_path)

# The original also ran `dataset = load_dataset('csv', ...)` here; that value
# was dead — immediately overwritten by the DatasetDict below — so the
# redundant full re-parse of the CSV has been removed.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# train_test_split already returns DataFrames, so the original's extra
# pd.DataFrame(...) re-wrap (a full copy) is unnecessary.
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Bundle the two splits so downstream code can index dataset['train'/'test'].
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
97
+
98
# Fresh tokenizer for the same mBART checkpoint used as the model below.
model_name = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_fn(examples):
    """Convert a batch of {'en', 'te'} rows into padded seq2seq features."""
    source_texts = [text for text in examples['en']]
    target_texts = [text for text in examples['te']]
    encoded = tokenizer(source_texts, max_length=128, truncation=True, padding="max_length")
    encoded["labels"] = tokenizer(
        target_texts, max_length=128, truncation=True, padding="max_length"
    ).input_ids
    return encoded


# Tokenize the two splits independently and re-bundle them.
tokenized_dataset = DatasetDict({
    'train': train_dataset.map(tokenize_fn, batched=True),
    'test': test_dataset.map(tokenize_fn, batched=True),
})
117
+
118
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the pre-trained mBART model for sequence-to-sequence fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): a first TrainingArguments(...) construction was removed here.
# It was dead code — fully overwritten by the TrainingArguments defined
# immediately below — and it used the `evaluation_strategy` kwarg, which was
# renamed to `eval_strategy` and removed in recent transformers releases
# (it would raise TypeError before training ever started).
136
+
137
# Hyper-parameters and bookkeeping for the Trainer.  Evaluation and
# checkpointing both happen once per epoch; the best checkpoint (lowest
# eval loss) is restored at the end, and results are pushed to the Hub.
training_args = TrainingArguments(
    # Output / logging locations.
    output_dir='./results',
    logging_dir='./logs',
    # Optimization schedule.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-epoch evaluation and checkpoint selection.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Publish checkpoints to the Hugging Face Hub.
    push_to_hub=True,
    hub_model_id="jaksani/Englishtranslator",
)
152
+
153
# Re-import and re-instantiate the model.  NOTE(review): this duplicates the
# earlier AutoModelForSeq2SeqLM load; this fresh instance is the one trained.
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM
model_name = "facebook/mbart-large-cc25"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name) # Define the model here

trainer = Trainer(
    model=model,                              # the freshly loaded mBART model
    args=training_args,                       # hyper-parameters defined above
    train_dataset=tokenized_dataset['train'], # tokenized training split
    eval_dataset=tokenized_dataset['test'],   # tokenized evaluation split
)

# Run fine-tuning; with push_to_hub=True, checkpoints are also uploaded.
trainer.train()

# Save the final weights locally as well.
model.save_pretrained('./fine-tuned-model')