nashit93 committed on
Commit
e3937dd
·
1 Parent(s): eb9d5ad

First commit with model

Browse files
Retraining_pipeline.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import os
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+ from datasets import Dataset
6
+ from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
7
+ from tensorflow.keras.optimizers import Adam
8
+
9
# --- Pipeline-wide configuration -------------------------------------------

# Master switch for the training phase.
train_flag = False  # Set this to False if you don't want to train

# Root directory that epoch checkpoints are written into / resumed from.
checkpoint_dir = 'model_checkpoints'

# Shared maximum sequence length for tokenization.
MAX_LENGTH = 2000

# SentencePiece tokenizer shared by preprocessing and the data collator.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
17
+
18
+ # Custom callback for saving the model
19
class CustomModelCheckpoint(tf.keras.callbacks.Callback):
    """Save the model in Hugging Face ``save_pretrained`` format every
    ``save_freq`` epochs.

    Keras' built-in ``ModelCheckpoint`` writes TF-native checkpoints; this
    callback instead writes directories that ``from_pretrained`` can reload
    directly (which is how the resume logic below finds them).
    """

    def __init__(self, checkpoint_dir, save_freq=1):
        """
        Args:
            checkpoint_dir: root directory checkpoints are written under.
            save_freq: save every N epochs (default: every epoch).
        """
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.save_freq = save_freq

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` is 0-based, so checkpoints land after epochs
        # save_freq, 2*save_freq, ...
        if (epoch + 1) % self.save_freq == 0:
            path = os.path.join(self.checkpoint_dir, f"checkpoint-{epoch + 1}")
            # exist_ok avoids the check-then-create race of the original
            # `if not os.path.exists(path): os.makedirs(path)` sequence.
            os.makedirs(path, exist_ok=True)
            self.model.save_pretrained(path)
31
+
32
# Resume from the most recently modified checkpoint directory if one
# exists; otherwise start from the pretrained base model.
latest_checkpoint = None
if os.path.exists(checkpoint_dir):
    candidates = (os.path.join(checkpoint_dir, entry)
                  for entry in os.listdir(checkpoint_dir))
    checkpoint_dirs = [c for c in candidates if os.path.isdir(c)]
    if checkpoint_dirs:
        # Newest by modification time wins.
        latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)

if latest_checkpoint:
    print("Resuming...")
    model = TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
else:
    model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
45
+
46
+ # # Load your data
47
+ # file_path1 = 'train.en'
48
+ # file_path2 = 'train.hi'
49
+ # lines1, lines2 = [], []
50
+
51
+ # # Read data from files
52
+ # with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
53
+ # for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
54
+ # try:
55
+ # line1_clean = line1.strip()
56
+ # line2_clean = line2.strip()
57
+ # lines1.append(line1_clean)
58
+ # lines2.append(line2_clean)
59
+ # except Exception as e:
60
+ # continue
61
+
62
+ # # Create DataFrame
63
+ # df = pd.DataFrame({
64
+ # 'Text': lines1,
65
+ # 'Expected': lines2
66
+ # })
67
+ # df = df.reset_index(drop=True)
68
+ # df = df.sample(frac=0.1)
69
+
70
# Load the pre-built parallel corpus (a pickled DataFrame).
# NOTE(review): read_pickle executes arbitrary code from the file — only
# load trusted data.
df = pd.read_pickle("srt_scapper/Quran/fin_data.nas")
df.columns = ["Text", "Expected"]  # source text / target translation

# Convert the DataFrame to a Hugging Face dataset and shuffle it
# deterministically so runs are reproducible.
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)
75
+
76
# Tokenization and data preparation.
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Pads/truncates both the source ("Text") and target ("Expected") columns
    to MAX_LENGTH and attaches the target token ids as "labels".

    Args:
        examples: a batched mapping with "Text" and "Expected" string lists.
    Returns:
        The tokenizer output dict with an added "labels" key.
    """
    # FIX: use the module-level MAX_LENGTH constant instead of a duplicated
    # hard-coded 2000 so the limit stays consistent across the pipeline.
    # NOTE(review): padding every example to max_length is memory-heavy;
    # dynamic padding via the collator may suffice — confirm.
    padding = "max_length"
    model_inputs = tokenizer(
        examples["Text"],
        max_length=MAX_LENGTH,
        padding=padding,
        truncation=True,
    )
    labels = tokenizer(
        examples["Expected"],
        max_length=MAX_LENGTH,
        padding=padding,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
87
+
88
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

# Batch-level collator.
# NOTE(review): labels are padded with the real pad id rather than -100, and
# preprocess already padded them to max_length — padding positions may
# contribute to the loss; confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=64,
    return_tensors="tf"
)

# Prepare dataset for training.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    batch_size=4,  # Consider reducing batch size if memory issues occur
    shuffle=True
)

# Compile the model; the HF TF model computes its own loss internally, so
# only the optimizer is supplied.
model.compile(optimizer=Adam(3e-5))

# Stop if training loss plateaus for 3 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Ensure the checkpoint directory exists (exist_ok avoids a
# check-then-create race).
os.makedirs(checkpoint_dir, exist_ok=True)

# Per-epoch save_pretrained-style checkpointing.
model_checkpoint = CustomModelCheckpoint(
    checkpoint_dir,
    save_freq=1  # Save after every epoch
)

# BUG FIX: `train_flag` is documented above as the switch that controls
# training ("Set this to False if you don't want to train") but was never
# consulted — the script trained unconditionally. Honor it here.
if train_flag:
    # Fit the model.
    model.fit(
        tf_train_dataset,
        epochs=10,
        callbacks=[early_stopping, model_checkpoint]
    )

    # Saving the final model (optional).
    model.save_pretrained(os.path.join(checkpoint_dir, 'final_model'))
model_checkpoints-small-on-1mill-dp/final_model/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-small",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 1024,
8
+ "d_kv": 64,
9
+ "d_model": 512,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "mt5",
20
+ "num_decoder_layers": 8,
21
+ "num_heads": 6,
22
+ "num_layers": 8,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "tokenizer_class": "T5Tokenizer",
28
+ "transformers_version": "4.38.2",
29
+ "use_cache": true,
30
+ "vocab_size": 250112
31
+ }
model_checkpoints-small-on-1mill-dp/final_model/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
model_checkpoints-small-on-1mill-dp/final_model/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80e4a4809a5644608a697c12a61d9ad480d1ee1ac29e16b4cfe48bc44da71793
3
+ size 2225556280
prediction_pipeline.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import tensorflow as tf
5
+ import os
6
+ from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer
7
+ import pandas as pd
8
+ from datasets import Dataset
9
+ from tqdm import tqdm
10
# Shared mT5 SentencePiece tokenizer, used both to encode prompts and to
# decode generated ids back to text.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
12
+
13
+
14
+
15
+ # # Load your data
16
+ # file_path1 = 'train.en'
17
+ # file_path2 = 'train.hi'
18
+ # lines1, lines2 = [], []
19
+
20
+ # # Read data from files
21
+ # with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
22
+ # for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
23
+ # try:
24
+ # line1_clean = line1.strip()
25
+ # line2_clean = line2.strip()
26
+ # lines1.append(line1_clean)
27
+ # lines2.append(line2_clean)
28
+ # except Exception as e:
29
+ # continue
30
+
31
+ # # Create DataFrame
32
+ # df = pd.DataFrame({
33
+ # 'Text': lines1,
34
+ # 'Expected': lines2
35
+ # })
36
+ # df = df.reset_index(drop=True)
37
+ # df = df.sample(frac=0.1)
38
+ # # Convert DataFrame to Hugging Face dataset format
39
+ # dataset = Dataset.from_pandas(df)
40
+ # dataset = dataset.shuffle(seed=42)
41
+
42
+
43
+
44
# Checkpoint-aware model loader.
def load_model(checkpoint_dir):
    """Load the newest checkpoint under *checkpoint_dir*, else the base model.

    Checkpoints are sub-directories written by ``save_pretrained``; the most
    recently modified one wins.
    """
    newest = None
    if os.path.exists(checkpoint_dir):
        entries = (os.path.join(checkpoint_dir, name)
                   for name in os.listdir(checkpoint_dir))
        subdirs = [p for p in entries if os.path.isdir(p)]
        if subdirs:
            newest = max(subdirs, key=os.path.getmtime)

    if newest:
        print("Loading model from:", newest)
        return TFMT5ForConditionalGeneration.from_pretrained(newest)
    else:
        print("No checkpoint found, loading default model")
        return TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
59
+
60
# Materialize the translation model once, at import time.
model = load_model('model_checkpoints-small-on-1mill-dp')
62
+
63
# Encoding helper for inference.
def prepare_text(text, tokenizer, max_length=200):
    """Encode *text* into a TF tensor of input ids, truncated to *max_length*."""
    return tokenizer.encode(
        text, return_tensors="tf", max_length=max_length, truncation=True
    )
67
+
68
# Inference entry point with adjustable generation settings.
def generate_prediction(text, model, tokenizer, max_length=2000, num_beams=5):
    """Translate *text* with beam search and return the decoded string.

    Beam search is run with bigram-repetition blocking; ``early_stopping``
    ends the search once enough finished candidates exist.
    """
    input_ids = prepare_text(text, tokenizer, max_length=max_length)
    generated = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    # Decode the best beam, dropping pad/eos special tokens.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
79
+
80
+
81
# Smoke-test the pipeline on a single sentence.
sample_text = "Hi , how are you?"
# sample_text = "Guide us to the straight path"
prediction = generate_prediction(sample_text, model, tokenizer)
print("Prediction:", prediction)