First commit with model
Browse files
Retraining_pipeline.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from datasets import Dataset
|
| 6 |
+
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
|
| 7 |
+
from tensorflow.keras.optimizers import Adam
|
| 8 |
+
|
| 9 |
+
# --- Configuration ---------------------------------------------------------

# Master switch for the training run; set to False to skip fitting.
train_flag = False  # Set this to False if you don't want to train

# Directory where per-epoch checkpoints are written to and resumed from.
checkpoint_dir = 'model_checkpoints'

# Token-length limit applied when encoding inputs and targets.
MAX_LENGTH = 2000

# Tokenizer shared by preprocessing and the data collator below.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
|
| 17 |
+
|
| 18 |
+
# Custom callback for saving the model
class CustomModelCheckpoint(tf.keras.callbacks.Callback):
    """Keras callback that writes Hugging Face `save_pretrained` checkpoints.

    The stock ModelCheckpoint saves Keras weight files; this one instead
    writes a full HF checkpoint directory every ``save_freq`` epochs so
    training can later be resumed via ``from_pretrained``.
    """

    def __init__(self, checkpoint_dir, save_freq=1):
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.save_freq = save_freq

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` is 0-based, so +1 gives the number of completed epochs.
        if (epoch + 1) % self.save_freq == 0:
            target = os.path.join(self.checkpoint_dir, f"checkpoint-{epoch + 1}")
            os.makedirs(target, exist_ok=True)
            self.model.save_pretrained(target)
|
| 31 |
+
|
| 32 |
+
# Load or initialize model: resume from the most recently modified
# checkpoint subdirectory when one exists, otherwise start from the
# pretrained base weights.
latest_checkpoint = None
if os.path.exists(checkpoint_dir):
    candidates = [
        os.path.join(checkpoint_dir, entry)
        for entry in os.listdir(checkpoint_dir)
    ]
    candidates = [c for c in candidates if os.path.isdir(c)]
    if candidates:
        # Most recent mtime wins — assumes newest dir is the latest epoch.
        latest_checkpoint = max(candidates, key=os.path.getmtime)

if latest_checkpoint:
    print("Resuming...")
    model = TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
else:
    model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
|
| 45 |
+
|
| 46 |
+
# NOTE(review): a large commented-out block that read parallel train.en /
# train.hi files into a DataFrame was dead code and has been removed.

# Load the prepared parallel corpus.
# SECURITY: pd.read_pickle deserializes with pickle, which can execute
# arbitrary code embedded in the file — only load trusted pickles.
df = pd.read_pickle("srt_scapper/Quran/fin_data.nas")
df.columns = ["Text", "Expected"]

# Convert DataFrame to Hugging Face dataset format; shuffle with a fixed
# seed so runs are reproducible.
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)
|
| 75 |
+
|
| 76 |
+
# Tokenization and data preparation
def preprocess_function(examples, max_length=MAX_LENGTH):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: batch dict from ``Dataset.map(batched=True)`` with
            "Text" (source) and "Expected" (target) string lists.
        max_length: truncation/padding length. Defaults to the module-level
            MAX_LENGTH so the limit lives in one place (the original
            duplicated the literal 2000 here).

    Returns:
        Tokenizer output dict with an added "labels" key holding the
        target token ids.
    """
    padding = "max_length"
    inputs = examples["Text"]
    targets = examples["Expected"]
    model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    # NOTE(review): padding every example to max_length makes the dynamic
    # DataCollatorForSeq2Seq padding redundant and inflates memory;
    # consider padding=False and letting the collator pad per batch.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
|
| 87 |
+
|
| 88 |
+
# Tokenize the whole dataset up front.
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

# Collator that pads each batch (to a multiple of 64) and emits TF tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
    pad_to_multiple_of=64,
    label_pad_token_id=tokenizer.pad_token_id,
)

# Wrap the tokenized dataset as a tf.data pipeline for model.fit.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,  # small on purpose; reduce further if memory runs out
)
|
| 105 |
+
|
| 106 |
+
# Compile with a fixed fine-tuning learning rate; the seq2seq loss is
# computed internally by the HF model, so no explicit loss is passed.
model.compile(optimizer=Adam(3e-5))

# Stop training when the loss fails to improve for three epochs in a row.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# The checkpoint directory must exist before the first save.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Write a resumable Hugging Face checkpoint after every epoch.
model_checkpoint = CustomModelCheckpoint(checkpoint_dir, save_freq=1)
|
| 121 |
+
|
| 122 |
+
# Fit the model — gated on train_flag, which was previously defined and
# documented as controlling training but never actually checked (the
# script always trained regardless of the flag's value).
if train_flag:
    model.fit(
        tf_train_dataset,
        epochs=10,
        callbacks=[early_stopping, model_checkpoint],
    )

    # Saving the final model (optional)
    model.save_pretrained(os.path.join(checkpoint_dir, 'final_model'))
|
model_checkpoints-small-on-1mill-dp/final_model/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "google/mt5-small",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"MT5ForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"classifier_dropout": 0.0,
|
| 7 |
+
"d_ff": 1024,
|
| 8 |
+
"d_kv": 64,
|
| 9 |
+
"d_model": 512,
|
| 10 |
+
"decoder_start_token_id": 0,
|
| 11 |
+
"dense_act_fn": "gelu_new",
|
| 12 |
+
"dropout_rate": 0.1,
|
| 13 |
+
"eos_token_id": 1,
|
| 14 |
+
"feed_forward_proj": "gated-gelu",
|
| 15 |
+
"initializer_factor": 1.0,
|
| 16 |
+
"is_encoder_decoder": true,
|
| 17 |
+
"is_gated_act": true,
|
| 18 |
+
"layer_norm_epsilon": 1e-06,
|
| 19 |
+
"model_type": "mt5",
|
| 20 |
+
"num_decoder_layers": 8,
|
| 21 |
+
"num_heads": 6,
|
| 22 |
+
"num_layers": 8,
|
| 23 |
+
"pad_token_id": 0,
|
| 24 |
+
"relative_attention_max_distance": 128,
|
| 25 |
+
"relative_attention_num_buckets": 32,
|
| 26 |
+
"tie_word_embeddings": false,
|
| 27 |
+
"tokenizer_class": "T5Tokenizer",
|
| 28 |
+
"transformers_version": "4.38.2",
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"vocab_size": 250112
|
| 31 |
+
}
|
model_checkpoints-small-on-1mill-dp/final_model/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"decoder_start_token_id": 0,
|
| 4 |
+
"eos_token_id": 1,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.38.2"
|
| 7 |
+
}
|
model_checkpoints-small-on-1mill-dp/final_model/tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80e4a4809a5644608a697c12a61d9ad480d1ee1ac29e16b4cfe48bc44da71793
|
| 3 |
+
size 2225556280
|
prediction_pipeline.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding: utf-8
|
| 3 |
+
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import os
|
| 6 |
+
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from datasets import Dataset
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
# Tokenizer used both to encode prompts and to decode generated ids.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# # Load your data
|
| 16 |
+
# file_path1 = 'train.en'
|
| 17 |
+
# file_path2 = 'train.hi'
|
| 18 |
+
# lines1, lines2 = [], []
|
| 19 |
+
|
| 20 |
+
# # Read data from files
|
| 21 |
+
# with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
|
| 22 |
+
# for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
|
| 23 |
+
# try:
|
| 24 |
+
# line1_clean = line1.strip()
|
| 25 |
+
# line2_clean = line2.strip()
|
| 26 |
+
# lines1.append(line1_clean)
|
| 27 |
+
# lines2.append(line2_clean)
|
| 28 |
+
# except Exception as e:
|
| 29 |
+
# continue
|
| 30 |
+
|
| 31 |
+
# # Create DataFrame
|
| 32 |
+
# df = pd.DataFrame({
|
| 33 |
+
# 'Text': lines1,
|
| 34 |
+
# 'Expected': lines2
|
| 35 |
+
# })
|
| 36 |
+
# df = df.reset_index(drop=True)
|
| 37 |
+
# df = df.sample(frac=0.1)
|
| 38 |
+
# # Convert DataFrame to Hugging Face dataset format
|
| 39 |
+
# dataset = Dataset.from_pandas(df)
|
| 40 |
+
# dataset = dataset.shuffle(seed=42)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Function to load the model from the latest checkpoint
def load_model(checkpoint_dir):
    """Return the model from the newest checkpoint, or the base model.

    The newest checkpoint is the subdirectory of ``checkpoint_dir`` with
    the most recent modification time; when none is found, the pretrained
    google/mt5-small weights are loaded instead.
    """
    latest_checkpoint = None
    if os.path.exists(checkpoint_dir):
        entries = (os.path.join(checkpoint_dir, name) for name in os.listdir(checkpoint_dir))
        subdirs = [e for e in entries if os.path.isdir(e)]
        if subdirs:
            latest_checkpoint = max(subdirs, key=os.path.getmtime)

    if latest_checkpoint:
        print("Loading model from:", latest_checkpoint)
        return TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
    else:
        print("No checkpoint found, loading default model")
        return TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")


# Load the model once, from the checkpoint directory shipped with the repo.
model = load_model('model_checkpoints-small-on-1mill-dp')
|
| 62 |
+
|
| 63 |
+
# Function to prepare text for prediction
def prepare_text(text, tokenizer, max_length=200):
    """Encode ``text`` into model-ready input ids (TF tensor), truncated."""
    return tokenizer.encode(text, return_tensors="tf", max_length=max_length, truncation=True)
|
| 67 |
+
|
| 68 |
+
# Function to generate prediction with adjustable settings
def generate_prediction(text, model, tokenizer, max_length=2000, num_beams=5):
    """Translate ``text`` with beam search and return the decoded string."""
    encoded = prepare_text(text, tokenizer, max_length=max_length)
    generated = model.generate(
        encoded,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,  # forbid verbatim bigram repetition
        early_stopping=True,     # finish each beam at its first EOS
    )
    # Strip pad/eos markers from the best (first) beam before returning.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# --- Example prediction ----------------------------------------------------
sample_text = "Hi , how are you?"
# sample_text = "Guide us to the straight path"
prediction = generate_prediction(sample_text, model, tokenizer)
print("Prediction:", prediction)
|