#!/usr/bin/env python
# coding: utf-8
import tensorflow as tf
import os
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
# Set up tokenizer
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
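# Note (suggestion, not part of the original pipeline): if tokenizer files were
# also saved alongside a fine-tuned checkpoint via tokenizer.save_pretrained(),
# the tokenizer could be loaded from that checkpoint directory instead of
# "google/mt5-small" so that tokenizer and model stay in sync.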
# # Load your data
# file_path1 = 'train.en'
# file_path2 = 'train.hi'
# lines1, lines2 = [], []
# # Read data from files
# with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
#     for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
#         try:
#             line1_clean = line1.strip()
#             line2_clean = line2.strip()
#             lines1.append(line1_clean)
#             lines2.append(line2_clean)
#         except Exception as e:
#             continue
# # Create DataFrame
# df = pd.DataFrame({
#     'Text': lines1,
#     'Expected': lines2
# })
# df = df.reset_index(drop=True)
# df = df.sample(frac=0.1)
# # Convert DataFrame to Hugging Face dataset format
# dataset = Dataset.from_pandas(df)
# dataset = dataset.shuffle(seed=42)
# Function to load the model from the latest checkpoint
def load_model(checkpoint_dir):
    latest_checkpoint = None
    if os.path.exists(checkpoint_dir):
        checkpoints = [os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir)]
        checkpoints = [d for d in checkpoints if os.path.isdir(d)]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=os.path.getmtime)
    if latest_checkpoint:
        print("Loading model from:", latest_checkpoint)
        return TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
    else:
        print("No checkpoint found, loading default model")
        return TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
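# Assumed checkpoint layout for load_model (subdirectory names below are
# hypothetical examples, not taken from the original repo):
#   model_checkpoints-small-on-1mill-dp/checkpoint-1000/
#   model_checkpoints-small-on-1mill-dp/checkpoint-2000/
# Each subdirectory is expected to contain files written by
# model.save_pretrained(path), e.g. config.json and tf_model.h5, so that
# from_pretrained() can load the most recently modified one.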
# Load the model
model = load_model('model_checkpoints-small-on-1mill-dp')
# Function to prepare text for prediction
def prepare_text(text, tokenizer, max_length=200):
    inputs = tokenizer.encode(text, return_tensors="tf", max_length=max_length, truncation=True)
    return inputs
# Function to generate prediction with adjustable settings
def generate_prediction(text, model, tokenizer, max_length=2000, num_beams=5):
    input_ids = prepare_text(text, tokenizer, max_length=max_length)
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
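# Minimal batched variant (a sketch, not part of the original script): tokenizes
# a list of sentences with padding and decodes all beam-search outputs at once.
# The function name and length defaults here are assumptions for illustration.
def generate_predictions_batch(texts, model, tokenizer, max_input_length=200,
                               max_output_length=200, num_beams=5):
    # Tokenize all inputs together; padding gives rectangular tensors and an
    # attention mask so padded positions are ignored during generation.
    enc = tokenizer(texts, return_tensors="tf", padding=True,
                    truncation=True, max_length=max_input_length)
    output_ids = model.generate(
        enc["input_ids"],
        attention_mask=enc["attention_mask"],
        max_length=max_output_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    # Decode every sequence in the batch, dropping pad/eos special tokens.
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)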
# Example Prediction
sample_text = "Hi , how are you?"
# sample_text = "Guide us to the straight path"
prediction = generate_prediction(sample_text, model, tokenizer)
print("Prediction:", prediction)