#!/usr/bin/env python
# coding: utf-8
import tensorflow as tf
import os
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
# Set up tokenizer
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
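# Note (suggestion, not part of the original pipeline): if tokenizer files were
# also saved alongside a fine-tuned checkpoint via tokenizer.save_pretrained(),
# the tokenizer could be loaded from that checkpoint directory instead of
# "google/mt5-small" so that tokenizer and model stay in sync.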
# # Load your data
# file_path1 = 'train.en'
# file_path2 = 'train.hi'
# lines1, lines2 = [], []
# # Read data from files
# with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2:
#     for line1, line2 in tqdm(zip(file1, file2), desc="Reading Data"):
#         try:
#             line1_clean = line1.strip()
#             line2_clean = line2.strip()
#             lines1.append(line1_clean)
#             lines2.append(line2_clean)
#         except Exception as e:
#             continue
# # Create DataFrame
# df = pd.DataFrame({
#     'Text': lines1,
#     'Expected': lines2
# })
# df = df.reset_index(drop=True)
# df = df.sample(frac=0.1)
# # Convert DataFrame to Hugging Face dataset format
# dataset = Dataset.from_pandas(df)
# dataset = dataset.shuffle(seed=42)
# Function to load the model from the latest checkpoint
def load_model(checkpoint_dir):
    latest_checkpoint = None
    if os.path.exists(checkpoint_dir):
        checkpoints = [os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir)]
        checkpoints = [d for d in checkpoints if os.path.isdir(d)]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=os.path.getmtime)
    if latest_checkpoint:
        print("Loading model from:", latest_checkpoint)
        return TFMT5ForConditionalGeneration.from_pretrained(latest_checkpoint)
    else:
        print("No checkpoint found, loading default model")
        return TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
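# Assumed checkpoint layout for load_model (subdirectory names below are
# hypothetical examples, not taken from the original repo):
#   model_checkpoints-small-on-1mill-dp/checkpoint-1000/
#   model_checkpoints-small-on-1mill-dp/checkpoint-2000/
# Each subdirectory is expected to contain files written by
# model.save_pretrained(path), e.g. config.json and tf_model.h5, so that
# from_pretrained() can load the most recently modified one.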
# Load the model
model = load_model('model_checkpoints-small-on-1mill-dp')
# Function to prepare text for prediction
def prepare_text(text, tokenizer, max_length=200):
    inputs = tokenizer.encode(text, return_tensors="tf", max_length=max_length, truncation=True)
    return inputs
# Function to generate prediction with adjustable settings
def generate_prediction(text, model, tokenizer, max_length=2000, num_beams=5):
    input_ids = prepare_text(text, tokenizer, max_length=max_length)
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
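# Minimal batched variant (a sketch, not part of the original script): tokenizes
# a list of sentences with padding and decodes all beam-search outputs at once.
# The function name and length defaults here are assumptions for illustration.
def generate_predictions_batch(texts, model, tokenizer, max_input_length=200,
                               max_output_length=200, num_beams=5):
    # Tokenize all inputs together; padding gives rectangular tensors and an
    # attention mask so padded positions are ignored during generation.
    enc = tokenizer(texts, return_tensors="tf", padding=True,
                    truncation=True, max_length=max_input_length)
    output_ids = model.generate(
        enc["input_ids"],
        attention_mask=enc["attention_mask"],
        max_length=max_output_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    # Decode every sequence in the batch, dropping pad/eos special tokens.
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)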
# Example Prediction
sample_text = "Hi , how are you?"
# sample_text = "Guide us to the straight path"
prediction = generate_prediction(sample_text, model, tokenizer)
print("Prediction:", prediction)