wesfggfd
/

Sequence_Models

Model card Files Files and versions

Metrics Training metrics Community

Sequence_Models / NLP with Attention Models /Text_Summarization /Summarization /tf /utils.py

wesfggfd's picture

Upload 145 files

9f2ab4b verified 7 months ago

history blame contribute delete

1.25 kB

	import pandas as pd
	import re


	def get_train_test_data(data_dir):
	    """Load the train and test splits stored as JSON under *data_dir*.

	    Reads ``train.json`` and ``test.json`` from ``data_dir`` with
	    ``pandas.read_json`` and removes the ``id`` column from each frame.

	    Args:
	        data_dir: Directory (path prefix) containing the two JSON files.

	    Returns:
	        A ``(train_data, test_data)`` tuple of DataFrames without ``id``.

	    Raises:
	        KeyError: If a loaded frame has no ``id`` column.
	    """
	    def load_without_id(json_path):
	        # Read one split and discard its identifier column.
	        frame = pd.read_json(json_path)
	        return frame.drop(columns=['id'])

	    # Train is read before test, same as the order of the returned tuple.
	    train_frame = load_without_id(f"{data_dir}/train.json")
	    test_frame = load_without_id(f"{data_dir}/test.json")
	    return train_frame, test_frame


	def preprocess(input_data):
	    """Normalize the ``summary`` and ``dialogue`` text columns of a DataFrame.

	    Each value is lowercased, every run of whitespace (spaces, newlines,
	    carriage returns, tabs) is collapsed to a single space, and the result
	    is wrapped as ``'[SOS] <text> [EOS]'``.

	    Note:
	        ``input_data`` is modified in place: its ``summary`` and
	        ``dialogue`` columns are overwritten with the processed text, so
	        calling this twice on the same frame wraps the text twice.

	    Args:
	        input_data: DataFrame with string columns ``dialogue`` and
	            ``summary``.

	    Returns:
	        A ``(document, summary)`` tuple holding the processed ``dialogue``
	        and ``summary`` columns of ``input_data``.
	    """
	    def preprocess_util(text):
	        # str.split() with no argument splits on any whitespace run and
	        # drops empty pieces, so this one step both removes \n, \r, \t and
	        # collapses repeated spaces. The previous regex "\n\|\r\|\t"
	        # treated "\|" as a literal pipe and never matched, and
	        # split(' ') / join(' ') was an identity transform.
	        normalized = ' '.join(text.lower().split())
	        # Add start of sentence and end of sentence tokens
	        return '[SOS] ' + normalized + ' [EOS]'

	    # Process column-wise; summary first, then dialogue.
	    input_data['summary'] = input_data['summary'].apply(preprocess_util)
	    input_data['dialogue'] = input_data['dialogue'].apply(preprocess_util)

	    document = input_data['dialogue']
	    summary = input_data['summary']

	    return document, summary