First commit with model
Browse files
Retraining_pipeline.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from datasets import Dataset
|
| 6 |
+
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
|
| 7 |
+
from tensorflow.keras.optimizers import Adam
|
| 8 |
+
|
| 9 |
+
# --- Configuration ---------------------------------------------------------

# Master switch for the training run; set to False to skip fitting.
train_flag = False  # Set this to False if you don't want to train

# Directory where per-epoch checkpoints are written to and resumed from.
checkpoint_dir = 'model_checkpoints'

# Token-length limit applied when encoding inputs and targets.
MAX_LENGTH = 2000

# Tokenizer shared by preprocessing and the data collator below.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
|
| 17 |
+
|
| 18 |
+
# Custom callback for saving the model
class CustomModelCheckpoint(tf.keras.callbacks.Callback):
    """Keras callback that writes Hugging Face `save_pretrained` checkpoints.

    The stock ModelCheckpoint saves Keras weight files; this one instead
    writes a full HF checkpoint directory every ``save_freq`` epochs so
    training can later be resumed via ``from_pretrained``.
    """

    def __init__(self, checkpoint_dir, save_freq=1):
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.save_freq = save_freq

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` is 0-based, so +1 gives the number of completed epochs.
        if (epoch + 1) % self.save_freq == 0:
            target = os.path.join(self.checkpoint_dir, f"checkpoint-{epoch + 1}")
            os.makedirs(target, exist_ok=True)
            self.model.save_pretrained(target)
|
| 31 |
+
|
| 32 |
+
# Load or initialize model: resume from the most recently modified
# checkpoint subdirectory when one exists, otherwise start from the
# pretrained base weights.
latest_checkpoint = None
if os.path.exists(checkpoint_dir):
    candidates = [
        os.path.join(checkpoint_dir, entry)
        for entry in os.listdir(checkpoint_dir)
    ]
    candidates = [c for c in candidates if os.path.isdir(c)]
    if candidates:
        # Most recent mtime wins — assumes newest dir is the latest epoch.
        latest_checkpoint = max(candidates, key=os.path.getmtime)

if latest_checkpoint:
    print("Resuming...")
    model = TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
else:
    model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
|
| 45 |
+
|
| 46 |
+
# NOTE(review): a large commented-out block that read parallel train.en /
# train.hi files into a DataFrame was dead code and has been removed.

# Load the prepared parallel corpus.
# SECURITY: pd.read_pickle deserializes with pickle, which can execute
# arbitrary code embedded in the file — only load trusted pickles.
df = pd.read_pickle("srt_scapper/Quran/fin_data.nas")
df.columns = ["Text", "Expected"]

# Convert DataFrame to Hugging Face dataset format; shuffle with a fixed
# seed so runs are reproducible.
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)
|
| 75 |
+
|
| 76 |
+
# Tokenization and data preparation
def preprocess_function(examples, max_length=MAX_LENGTH):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: batch dict from ``Dataset.map(batched=True)`` with
            "Text" (source) and "Expected" (target) string lists.
        max_length: truncation/padding length. Defaults to the module-level
            MAX_LENGTH so the limit lives in one place (the original
            duplicated the literal 2000 here).

    Returns:
        Tokenizer output dict with an added "labels" key holding the
        target token ids.
    """
    padding = "max_length"
    inputs = examples["Text"]
    targets = examples["Expected"]
    model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    # NOTE(review): padding every example to max_length makes the dynamic
    # DataCollatorForSeq2Seq padding redundant and inflates memory;
    # consider padding=False and letting the collator pad per batch.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
|
| 87 |
+
|
| 88 |
+
# Tokenize the whole dataset up front.
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

# Collator that pads each batch (to a multiple of 64) and emits TF tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
    pad_to_multiple_of=64,
    label_pad_token_id=tokenizer.pad_token_id,
)

# Wrap the tokenized dataset as a tf.data pipeline for model.fit.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,  # small on purpose; reduce further if memory runs out
)
|
| 105 |
+
|
| 106 |
+
# Compile with a fixed fine-tuning learning rate; the seq2seq loss is
# computed internally by the HF model, so no explicit loss is passed.
model.compile(optimizer=Adam(3e-5))

# Stop training when the loss fails to improve for three epochs in a row.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# The checkpoint directory must exist before the first save.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Write a resumable Hugging Face checkpoint after every epoch.
model_checkpoint = CustomModelCheckpoint(checkpoint_dir, save_freq=1)
|
| 121 |
+
|
| 122 |
+
# Fit the model — gated on train_flag, which was previously defined and
# documented as controlling training but never actually checked (the
# script always trained regardless of the flag's value).
if train_flag:
    model.fit(
        tf_train_dataset,
        epochs=10,
        callbacks=[early_stopping, model_checkpoint],
    )

    # Saving the final model (optional)
    model.save_pretrained(os.path.join(checkpoint_dir, 'final_model'))
|
model_checkpoints-small-on-1mill-dp/final_model/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "google/mt5-small",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"MT5ForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"classifier_dropout": 0.0,
|
| 7 |
+
"d_ff": 1024,
|
| 8 |
+
"d_kv": 64,
|
| 9 |
+
"d_model": 512,
|
| 10 |
+
"decoder_start_token_id": 0,
|
| 11 |
+
"dense_act_fn": "gelu_new",
|
| 12 |
+
"dropout_rate": 0.1,
|
| 13 |
+
"eos_token_id": 1,
|
| 14 |
+
"feed_forward_proj": "gated-gelu",
|
| 15 |
+
"initializer_factor": 1.0,
|
| 16 |
+
"is_encoder_decoder": true,
|
| 17 |
+
"is_gated_act": true,
|
| 18 |
+
"layer_norm_epsilon": 1e-06,
|
| 19 |
+
"model_type": "mt5",
|
| 20 |
+
"num_decoder_layers": 8,
|
| 21 |
+
"num_heads": 6,
|
| 22 |
+
"num_layers": 8,
|
| 23 |
+
"pad_token_id": 0,
|
| 24 |
+
"relative_attention_max_distance": 128,
|
| 25 |
+
"relative_attention_num_buckets": 32,
|
| 26 |
+
"tie_word_embeddings": false,
|
| 27 |
+
"tokenizer_class": "T5Tokenizer",
|
| 28 |
+
"transformers_version": "4.38.2",
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"vocab_size": 250112
|
| 31 |
+
}
|
model_checkpoints-small-on-1mill-dp/final_model/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"decoder_start_token_id": 0,
|
| 4 |
+
"eos_token_id": 1,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.38.2"
|
| 7 |
+
}
|
model_checkpoints-small-on-1mill-dp/final_model/tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80e4a4809a5644608a697c12a61d9ad480d1ee1ac29e16b4cfe48bc44da71793
|
| 3 |
+
size 2225556280
|
prediction_pipeline.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding: utf-8
|
| 3 |
+
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import os
|
| 6 |
+
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from datasets import Dataset
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
# Tokenizer used both to encode prompts and to decode generated ids.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# # Load your data
|
| 16 |
+
# file_path1 = 'train.en'
|
| 17 |
+
# file_path2 = 'train.hi'
|
| 18 |
+
# lines1, lines2 = [], []
|
| 19 |
+
|
| 20 |
+
# # Read data from files
|
| 21 |
+
# with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
|
| 22 |
+
# for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
|
| 23 |
+
# try:
|
| 24 |
+
# line1_clean = line1.strip()
|
| 25 |
+
# line2_clean = line2.strip()
|
| 26 |
+
# lines1.append(line1_clean)
|
| 27 |
+
# lines2.append(line2_clean)
|
| 28 |
+
# except Exception as e:
|
| 29 |
+
# continue
|
| 30 |
+
|
| 31 |
+
# # Create DataFrame
|
| 32 |
+
# df = pd.DataFrame({
|
| 33 |
+
# 'Text': lines1,
|
| 34 |
+
# 'Expected': lines2
|
| 35 |
+
# })
|
| 36 |
+
# df = df.reset_index(drop=True)
|
| 37 |
+
# df = df.sample(frac=0.1)
|
| 38 |
+
# # Convert DataFrame to Hugging Face dataset format
|
| 39 |
+
# dataset = Dataset.from_pandas(df)
|
| 40 |
+
# dataset = dataset.shuffle(seed=42)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Function to load the model from the latest checkpoint
def load_model(checkpoint_dir):
    """Return the model from the newest checkpoint, or the base model.

    The newest checkpoint is the subdirectory of ``checkpoint_dir`` with
    the most recent modification time; when none is found, the pretrained
    google/mt5-small weights are loaded instead.
    """
    latest_checkpoint = None
    if os.path.exists(checkpoint_dir):
        entries = (os.path.join(checkpoint_dir, name) for name in os.listdir(checkpoint_dir))
        subdirs = [e for e in entries if os.path.isdir(e)]
        if subdirs:
            latest_checkpoint = max(subdirs, key=os.path.getmtime)

    if latest_checkpoint:
        print("Loading model from:", latest_checkpoint)
        return TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
    else:
        print("No checkpoint found, loading default model")
        return TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")


# Load the model once, from the checkpoint directory shipped with the repo.
model = load_model('model_checkpoints-small-on-1mill-dp')
|
| 62 |
+
|
| 63 |
+
# Function to prepare text for prediction
def prepare_text(text, tokenizer, max_length=200):
    """Encode ``text`` into model-ready input ids (TF tensor), truncated."""
    return tokenizer.encode(text, return_tensors="tf", max_length=max_length, truncation=True)
|
| 67 |
+
|
| 68 |
+
# Function to generate prediction with adjustable settings
def generate_prediction(text, model, tokenizer, max_length=2000, num_beams=5):
    """Translate ``text`` with beam search and return the decoded string."""
    encoded = prepare_text(text, tokenizer, max_length=max_length)
    generated = model.generate(
        encoded,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,  # forbid verbatim bigram repetition
        early_stopping=True,     # finish each beam at its first EOS
    )
    # Strip pad/eos markers from the best (first) beam before returning.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# --- Example prediction ----------------------------------------------------
sample_text = "Hi , how are you?"
# sample_text = "Guide us to the straight path"
prediction = generate_prediction(sample_text, model, tokenizer)
print("Prediction:", prediction)
|