| """Test NCF data pipeline."""
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| from collections import defaultdict
|
| import hashlib
|
| import os
|
|
|
| import mock
|
|
|
| import numpy as np
|
| import scipy.stats
|
| import tensorflow as tf, tf_keras
|
|
|
from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing
from official.recommendation import movielens
from official.recommendation import popen_helper

DATASET = "ml-test"
NUM_USERS = 1000
NUM_ITEMS = 2000
NUM_PTS = 50000
BATCH_SIZE = 2048
EVAL_BATCH_SIZE = 4000
NUM_NEG = 4

# Expected MD5 checksums of the flattened batch contents. The pipeline is
# instantiated with deterministic=True, so its output should be reproducible
# bit-for-bit across runs.
END_TO_END_TRAIN_MD5 = "b218738e915e825d03939c5e305a2698"
END_TO_END_EVAL_MD5 = "d753d0f3186831466d6e218163a9501e"
FRESH_RANDOMNESS_MD5 = "63d0dff73c0e5f1048fbdc8c65021e22"


def mock_download(*args, **kwargs):
  """Stand-in for movielens.download; the test writes its own ratings file."""
  return


# The fork-based worker pool used by the data producer interacts badly with
# the threading used by tf.test.TestCase, so the tests substitute a serial
# "fauxpool" that exposes the same interface but runs in-process.
@mock.patch.object(popen_helper, "get_forkpool", popen_helper.get_fauxpool)
class BaseTest(tf.test.TestCase):

  def setUp(self):
    tf.compat.v1.disable_eager_execution()
    self.temp_data_dir = self.get_temp_dir()
    ratings_folder = os.path.join(self.temp_data_dir, DATASET)
    tf.io.gfile.makedirs(ratings_folder)
    np.random.seed(0)

    # Draw raw ids from a range three times larger than needed so that they
    # are sparse and non-contiguous, which exercises the id remapping.
    raw_user_ids = np.arange(NUM_USERS * 3)
    np.random.shuffle(raw_user_ids)
    raw_user_ids = raw_user_ids[:NUM_USERS]

    raw_item_ids = np.arange(NUM_ITEMS * 3)
    np.random.shuffle(raw_item_ids)
    raw_item_ids = raw_item_ids[:NUM_ITEMS]

    users = np.random.choice(raw_user_ids, NUM_PTS)
    items = np.random.choice(raw_item_ids, NUM_PTS)
    scores = np.random.randint(low=0, high=5, size=NUM_PTS)
    times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS)

    self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
    self.seen_pairs = set()
    self.holdout = {}
    with tf.io.gfile.GFile(self.rating_file, "w") as f:
      f.write("user_id,item_id,rating,timestamp\n")
      for usr, itm, scr, ts in zip(users, items, scores, times):
        pair = (usr, itm)
        if pair in self.seen_pairs:
          continue  # Skip duplicate (user, item) pairs.
        self.seen_pairs.add(pair)
        if usr not in self.holdout or (ts, itm) > self.holdout[usr]:
          # Track each user's most recent interaction; the pipeline holds it
          # out as that user's evaluation positive.
          self.holdout[usr] = (ts, itm)

        f.write("{},{},{},{}\n".format(usr, itm, scr, ts))

    movielens.download = mock_download
    movielens.NUM_RATINGS[DATASET] = NUM_PTS
    movielens.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS, NUM_ITEMS)

  def make_params(self, train_epochs=1):
    return {
        "train_epochs": train_epochs,
        "batches_per_step": 1,
        "use_seed": False,
        "batch_size": BATCH_SIZE,
        "eval_batch_size": EVAL_BATCH_SIZE,
        "num_neg": NUM_NEG,
        "match_mlperf": True,
        "use_tpu": False,
        "use_xla_for_gpu": False,
        "stream_files": False,
    }

  def test_preprocessing(self):
    # Most of the substantive checks happen inside _filter_index_sort()
    # itself; here we only verify the sizes of the resulting id maps.
    cache_path = os.path.join(self.temp_data_dir, "test_cache.pickle")
    data, valid_cache = data_preprocessing._filter_index_sort(
        self.rating_file, cache_path=cache_path)

    assert len(data[rconst.USER_MAP]) == NUM_USERS
    assert len(data[rconst.ITEM_MAP]) == NUM_ITEMS

  def drain_dataset(self, dataset, g):
    # Drain the dataset to a list of numpy batches, stopping when the
    # one-shot iterator raises OutOfRangeError.
    with self.session(graph=g) as sess:
      with g.as_default():
        batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
      output = []
      while True:
        try:
          output.append(sess.run(batch))
        except tf.errors.OutOfRangeError:
          break
    return output

  def _test_end_to_end(self, constructor_type):
    params = self.make_params(train_epochs=1)
    _, _, producer = data_preprocessing.instantiate_pipeline(
        dataset=DATASET,
        data_dir=self.temp_data_dir,
        params=params,
        constructor_type=constructor_type,
        deterministic=True)

    producer.start()
    producer.join()
    assert producer._fatal_exception is None

    user_inv_map = {v: k for k, v in producer.user_map.items()}
    item_inv_map = {v: k for k, v in producer.item_map.items()}

    # ========================================================================
    # == Training Data =======================================================
    # ========================================================================
    g = tf.Graph()
    with g.as_default():
      input_fn = producer.make_input_fn(is_training=True)
      dataset = input_fn(params)

    first_epoch = self.drain_dataset(dataset=dataset, g=g)

    counts = defaultdict(int)
    train_examples = {
        True: set(),
        False: set(),
    }

    md5 = hashlib.md5()
    for features, labels in first_epoch:
      data_list = [
          features[movielens.USER_COLUMN].flatten(),
          features[movielens.ITEM_COLUMN].flatten(),
          features[rconst.VALID_POINT_MASK].flatten(),
          labels.flatten()
      ]
      for i in data_list:
        md5.update(i.tobytes())

      for u, i, v, l in zip(*data_list):
        if not v:
          continue  # Points without the valid mask set are padding.

        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if ((u_raw, i_raw) in self.seen_pairs) != l:
          # The holdout item is excluded from negative generation, so it can
          # occasionally be emitted as a negative during training even though
          # the user has in fact interacted with it.
          assert not l
          self.assertEqual(i_raw, self.holdout[u_raw][1])
        train_examples[l].add((u_raw, i_raw))
        counts[(u_raw, i_raw)] += 1

    self.assertEqual(md5.hexdigest(), END_TO_END_TRAIN_MD5)

    num_positives_seen = len(train_examples[True])
    self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)

    # Negatives are drawn with replacement, so some duplication among the
    # sampled negatives is expected and the number of distinct negative pairs
    # can fall short of NUM_NEG per positive. Requiring a ratio above 0.9
    # checks that negative generation is reasonably well spread out.
    self.assertGreater(
        len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)
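
    # Rough sanity check on the 0.9 bound (illustrative numbers only, not
    # asserted): drawing n samples uniformly with replacement from N
    # candidate pairs leaves an expected distinct fraction of about
    # (N / n) * (1 - exp(-n / N)). The sampling here has n / N of roughly
    # 0.1, for which that expression is about 0.95, safely above 0.9.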

    # Each positive pair appears once per epoch and repeated negatives are
    # rare, so the mean count per observed pair should be close to one. A
    # mean well above one would suggest the sample workers are duplicating
    # work rather than sampling independently.
    self.assertLess(np.mean(list(counts.values())), 1.1)

    # ========================================================================
    # == Eval Data ===========================================================
    # ========================================================================
    with g.as_default():
      input_fn = producer.make_input_fn(is_training=False)
      dataset = input_fn(params)

    eval_data = self.drain_dataset(dataset=dataset, g=g)

    current_user = None
    md5 = hashlib.md5()
    for features in eval_data:
      data_list = [
          features[movielens.USER_COLUMN].flatten(),
          features[movielens.ITEM_COLUMN].flatten(),
          features[rconst.DUPLICATE_MASK].flatten()
      ]
      for i in data_list:
        md5.update(i.tobytes())

      for idx, (u, i, d) in enumerate(zip(*data_list)):
        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if current_user is None:
          current_user = u

        # All of the eval examples for a given user should appear in one
        # contiguous block.
        self.assertEqual(u, current_user)

        # Each user's block is rconst.NUM_EVAL_NEGATIVES negatives followed
        # by the holdout positive.
        if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
          # The last element of the block must be the holdout item.
          self.assertEqual(i_raw, self.holdout[u_raw][1])
          current_user = None

        elif i_raw == self.holdout[u_raw][1]:
          # If the holdout item is drawn as an eval negative it duplicates
          # the positive in the same block, and must therefore be flagged by
          # the duplicate mask.
          assert d

        else:
          # Everything else must be a true negative: a pair the user never
          # interacted with.
          assert (u_raw, i_raw) not in self.seen_pairs

    self.assertEqual(md5.hexdigest(), END_TO_END_EVAL_MD5)

  def _test_fresh_randomness(self, constructor_type):
    train_epochs = 5
    params = self.make_params(train_epochs=train_epochs)
    _, _, producer = data_preprocessing.instantiate_pipeline(
        dataset=DATASET,
        data_dir=self.temp_data_dir,
        params=params,
        constructor_type=constructor_type,
        deterministic=True)

    producer.start()

    results = []
    g = tf.Graph()
    with g.as_default():
      for _ in range(train_epochs):
        input_fn = producer.make_input_fn(is_training=True)
        dataset = input_fn(params)
        results.extend(self.drain_dataset(dataset=dataset, g=g))

    producer.join()
    assert producer._fatal_exception is None

    positive_counts, negative_counts = defaultdict(int), defaultdict(int)
    md5 = hashlib.md5()
    for features, labels in results:
      data_list = [
          features[movielens.USER_COLUMN].flatten(),
          features[movielens.ITEM_COLUMN].flatten(),
          features[rconst.VALID_POINT_MASK].flatten(),
          labels.flatten()
      ]
      for i in data_list:
        md5.update(i.tobytes())

      for u, i, v, l in zip(*data_list):
        if not v:
          continue  # Points without the valid mask set are padding.

        if l:
          positive_counts[(u, i)] += 1
        else:
          negative_counts[(u, i)] += 1

    self.assertEqual(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

    # Each positive example should appear exactly once per training epoch.
    self.assertAllEqual(
        list(positive_counts.values()),
        [train_epochs for _ in positive_counts])

    # The negative checks are necessarily heuristic: repeats across epochs
    # are expected, but they should not be too frequent.
    pair_cardinality = NUM_USERS * NUM_ITEMS
    neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

    # Approximate probability that a particular negative pair is drawn in a
    # given epoch. This treats all negative pairs as equally likely, which is
    # not true in general, but the uniform generation in setUp() makes it a
    # good enough approximation for a heuristic test.
    e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

    # When the number of candidate negative pairs is much larger than the
    # number of samples drawn per epoch, the number of times a given negative
    # pair appears across train_epochs epochs is approximately binomial with
    # n=train_epochs and p=e_sample.
    approx_pdf = scipy.stats.binom.pmf(
        k=np.arange(train_epochs + 1), n=train_epochs, p=e_sample)
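
    # For intuition, with illustrative numbers rather than values asserted
    # here: with n=5 epochs and per-epoch probability p=0.1, the chance of
    # seeing a pair exactly once is C(5, 1) * 0.1 * 0.9**4 ≈ 0.328, which is
    # what scipy.stats.binom.pmf(k=1, n=5, p=0.1) evaluates to.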

    # Tally the observed counts, folding any count above train_epochs into
    # the top bucket. Bucket zero is everything that was never sampled.
    count_distribution = [0 for _ in range(train_epochs + 1)]
    for i in negative_counts.values():
      i = min([i, train_epochs])
      count_distribution[i] += 1
    count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])

    # Check that observed frequencies approximately match the binomial model.
    for i in range(train_epochs + 1):
      if approx_pdf[i] < 0.05:
        continue  # Skip the tails, where relative variance is high.

      observed_fraction = count_distribution[i] / neg_pair_cardinality
      deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                   (observed_fraction + approx_pdf[i]))
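
      # deviation is the symmetric relative difference between the observed
      # and predicted fractions. With illustrative numbers (not asserted
      # here): an observed fraction of 0.10 against a predicted 0.12 gives
      # 2 * |0.10 - 0.12| / (0.10 + 0.12) ≈ 0.18, inside the 0.2 bound.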
      self.assertLess(deviation, 0.2)

  def test_end_to_end_materialized(self):
    self._test_end_to_end("materialized")

  def test_end_to_end_bisection(self):
    self._test_end_to_end("bisection")

  def test_fresh_randomness_materialized(self):
    self._test_fresh_randomness("materialized")

  def test_fresh_randomness_bisection(self):
    self._test_fresh_randomness("bisection")


if __name__ == "__main__":
  tf.test.main()