Spaces:
Sleeping
Sleeping
import tensorflow_hub as hub

# TF-Hub handle for the cased BERT-Base checkpoint (12 layers, 768 hidden, 12 heads).
BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
# NOTE(review): hub.Module is the TF1-style Hub API; for TF2 this would be
# hub.load / hub.KerasLayer — confirm which TensorFlow version this runs under.
module = hub.Module(BERT_URL)
# Look at the descriptor. This would tell you the model name
# cat $TFHUB_CACHE_DIR/ecd2596ce849110246602e3d4d81e2d9719cb027.descriptor.txt
# Further look at the assets folder, this has the file `vocab.txt`
# ls $TFHUB_CACHE_DIR/ecd2596ce849110246602e3d4d81e2d9719cb027/assets
# `tokenization` is the tokenizer module shipped with google-research/bert,
# assumed to be on the path alongside this script.
import tokenization
def create_tokenizer(vocab_file, do_lower_case=False):
    """Build a BERT ``FullTokenizer`` from a WordPiece vocab file.

    vocab_file: path to the vocab (the ``vocab.txt`` shipped in the hub
        module's assets directory).
    do_lower_case: whether input is lowercased before tokenizing; False
        matches the cased BERT checkpoint used above.
    """
    tokenizer_cls = tokenization.FullTokenizer
    return tokenizer_cls(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Module-level tokenizer instance used by the feature-conversion helpers.
# NOTE(review): assumes `vocab.txt` is present in the working directory —
# it actually lives under $TFHUB_CACHE_DIR/<hash>/assets (see comments above);
# confirm it was copied here, or pass the full cache path instead.
tokenizer = create_tokenizer('vocab.txt', do_lower_case=False)
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Turn one sentence into fixed-length BERT input features.

    The token sequence is ``[CLS]`` + wordpieces + ``[SEP]``, truncated so
    the total length (markers included) never exceeds ``max_seq_len``.

    Returns a ``(input_ids, input_mask, segment_ids)`` triple; each list is
    zero-padded to exactly ``max_seq_len`` entries. All segment ids are 0
    because this handles a single sentence (no sentence-pair input).
    """
    pieces = ['[CLS]'] + tokenizer.tokenize(sentence)
    # Reserve one slot for the trailing [SEP] marker.
    pieces = pieces[:max_seq_len - 1]
    pieces.append('[SEP]')

    input_ids = tokenizer.convert_tokens_to_ids(pieces)
    input_mask = [1] * len(pieces)    # 1 marks real tokens, 0 marks padding
    segment_ids = [0] * len(pieces)   # single-sentence input: segment 0 only

    padding = [0] * (max_seq_len - len(pieces))
    return input_ids + padding, input_mask + padding, segment_ids + padding
def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Batch version of ``convert_sentence_to_features``.

    Converts every sentence in ``sentences`` and returns a triple of
    parallel lists ``(all_input_ids, all_input_mask, all_segment_ids)``,
    one entry per sentence, each entry padded to ``max_seq_len``.
    """
    triples = [
        convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        for sentence in sentences
    ]
    if not triples:
        # zip(*[]) would yield nothing; preserve the three-empty-lists shape.
        return [], [], []
    all_ids, all_masks, all_segments = zip(*triples)
    return list(all_ids), list(all_masks), list(all_segments)