Spaces:
Sleeping
Sleeping
import tensorflow_hub as hub

# TF-Hub handle for the cased BERT-Base checkpoint (12 layers, 768 hidden, 12 heads).
BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
# NOTE(review): hub.Module is the TF1-style Hub API; for TF2 this would be
# hub.load / hub.KerasLayer — confirm which TensorFlow version this runs under.
module = hub.Module(BERT_URL)
# Look at the descriptor. This would tell you the model name
# cat $TFHUB_CACHE_DIR/ecd2596ce849110246602e3d4d81e2d9719cb027.descriptor.txt
# Further look at the assets folder, this has the file `vocab.txt`
# ls $TFHUB_CACHE_DIR/ecd2596ce849110246602e3d4d81e2d9719cb027/assets
# `tokenization` is the tokenizer module shipped with google-research/bert,
# assumed to be on the path alongside this script.
import tokenization
def create_tokenizer(vocab_file, do_lower_case=False):
    """Build a BERT ``FullTokenizer`` from a WordPiece vocab file.

    vocab_file: path to the vocab (the ``vocab.txt`` shipped in the hub
        module's assets directory).
    do_lower_case: whether input is lowercased before tokenizing; False
        matches the cased BERT checkpoint used above.
    """
    tokenizer_cls = tokenization.FullTokenizer
    return tokenizer_cls(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Module-level tokenizer instance used by the feature-conversion helpers.
# NOTE(review): assumes `vocab.txt` is present in the working directory —
# it actually lives under $TFHUB_CACHE_DIR/<hash>/assets (see comments above);
# confirm it was copied here, or pass the full cache path instead.
tokenizer = create_tokenizer('vocab.txt', do_lower_case=False)
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Turn one sentence into fixed-length BERT input features.

    The token sequence is ``[CLS]`` + wordpieces + ``[SEP]``, truncated so
    the total length (markers included) never exceeds ``max_seq_len``.

    Returns a ``(input_ids, input_mask, segment_ids)`` triple; each list is
    zero-padded to exactly ``max_seq_len`` entries. All segment ids are 0
    because this handles a single sentence (no sentence-pair input).
    """
    pieces = ['[CLS]'] + tokenizer.tokenize(sentence)
    # Reserve one slot for the trailing [SEP] marker.
    pieces = pieces[:max_seq_len - 1]
    pieces.append('[SEP]')

    input_ids = tokenizer.convert_tokens_to_ids(pieces)
    input_mask = [1] * len(pieces)    # 1 marks real tokens, 0 marks padding
    segment_ids = [0] * len(pieces)   # single-sentence input: segment 0 only

    padding = [0] * (max_seq_len - len(pieces))
    return input_ids + padding, input_mask + padding, segment_ids + padding
def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Batch version of ``convert_sentence_to_features``.

    Converts every sentence in ``sentences`` and returns a triple of
    parallel lists ``(all_input_ids, all_input_mask, all_segment_ids)``,
    one entry per sentence, each entry padded to ``max_seq_len``.
    """
    triples = [
        convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        for sentence in sentences
    ]
    if not triples:
        # zip(*[]) would yield nothing; preserve the three-empty-lists shape.
        return [], [], []
    all_ids, all_masks, all_segments = zip(*triples)
    return list(all_ids), list(all_masks), list(all_segments)