from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import tensorflow as tf
import numpy as np

# DistilBERT fine-tuned on SQuAD v1.1 for extractive question answering.
# NOTE: from_pretrained downloads weights on first use — this module does
# network I/O at import time.
checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
| |
|
def question_answering_tf(question, context):
    """Answer *question* by extracting a span from *context*.

    Long contexts are split into overlapping chunks (max_length=384,
    stride=50); the best-scoring (start, end) token pair across all
    chunks is mapped back to character offsets and the matching
    substring of ``context`` is returned.

    Args:
        question: The question string.
        context: The passage to extract the answer from.

    Returns:
        The answer substring of ``context`` (empty string when the
        best span collapses to the [CLS] "no answer" position).
    """
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        stride=50,
        truncation="only_second",
        padding=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_tensors="tf",
    )

    # Tokenizer bookkeeping, not model inputs — remove before the forward pass.
    _ = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    outputs = model(inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Build a per-chunk mask of positions that must NOT be part of the
    # answer: question tokens, special tokens, and padding. The original
    # code used sequence_ids() of chunk 0 for every chunk; with
    # padding=True a shorter chunk's trailing [SEP] can sit at a position
    # chunk 0 labels as context, leaving it unmasked.
    num_chunks = start_logits.shape[0]
    mask_rows = []
    for chunk in range(num_chunks):
        seq_ids = inputs.sequence_ids(chunk)  # None=special, 0=question, 1=context
        row = [sid != 1 for sid in seq_ids]
        row[0] = False  # keep [CLS] available as the "no answer" position
        mask_rows.append(row)
    mask = tf.math.logical_or(tf.constant(mask_rows), inputs["attention_mask"] == 0)

    # Drive masked positions to ~zero probability before the softmax.
    start_logits = tf.where(mask, -10000.0, start_logits)
    end_logits = tf.where(mask, -10000.0, end_logits)

    start_probabilities = tf.nn.softmax(start_logits, axis=-1).numpy()
    end_probabilities = tf.nn.softmax(end_logits, axis=-1).numpy()

    # Search every chunk for the best (start, end) pair with start <= end.
    max_score = 0.0
    start_token = end_token = offset_index = 0
    for i, (sp, ep) in enumerate(zip(start_probabilities, end_probabilities)):
        # Outer product: scores[r, c] = P(start=r) * P(end=c).
        scores = sp[:, np.newaxis] * ep[np.newaxis, :]
        # Upper triangle enforces start <= end.
        best = int(np.triu(scores).argmax())
        row, col = divmod(best, scores.shape[1])
        score = scores[row, col]
        if score > max_score:
            max_score = score
            start_token, end_token, offset_index = row, col, i

    # Map winning token indices back to character offsets; int() avoids
    # relying on implicit tensor-to-index conversion when slicing the str.
    start_char = int(offset_mapping[offset_index][start_token][0])
    end_char = int(offset_mapping[offset_index][end_token][1])
    return context[start_char:end_char]