from ...torch_core import *
from ...layers import *
from ...train import ClassificationInterpretation
from ...basic_train import *
from ...basic_data import *
from ..data import TextClasDataBunch
import matplotlib.cm as cm

__all__ = ['EmbeddingDropout', 'LinearDecoder', 'AWD_LSTM', 'RNNDropout',
           'SequentialRNN', 'WeightDropout', 'dropout_mask', 'awd_lstm_lm_split', 'awd_lstm_clas_split',
           'awd_lstm_lm_config', 'awd_lstm_clas_config', 'TextClassificationInterpretation']

def dropout_mask(x:Tensor, sz:Collection[int], p:float):
    "Return a dropout mask of the same type as `x`, of size `sz`, with each element having probability `p` of being zeroed."
    return x.new(*sz).bernoulli_(1-p).div_(1-p)
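
# Example (illustrative sketch, not part of the original module): with `p=0.5` the
# surviving entries are scaled by 1/(1-p)=2, so the masked tensor keeps the same
# expected value as the input:
#   t = torch.randn(3, 4)
#   m = dropout_mask(t, (3, 4), 0.5)   # entries are either 0. or 2.
#   masked = t * m                     # roughly half of the activations are zeroed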

class RNNDropout(Module):
    "Dropout with probability `p` that is consistent across the seq_len dimension."

    def __init__(self, p:float=0.5): self.p=p

    def forward(self, x:Tensor)->Tensor:
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
        return x * m
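
# Example (illustrative): the mask has shape (bs, 1, n_hid) and is broadcast over
# dimension 1, so for batch-first activations of shape (bs, sl, n_hid) a given unit is
# either kept or dropped for the whole sequence:
#   dp = RNNDropout(0.3)               # dropout is only applied while dp.training is True
#   y = dp(torch.randn(2, 5, 7))       # zeroed units are the same at every timestep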

class WeightDropout(Module):
    "A module that wraps another layer in which some weights will be replaced by 0 during training."

    def __init__(self, module:nn.Module, weight_p:float, layer_names:Collection[str]=['weight_hh_l0']):
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Apply dropout to the raw weights."
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args:ArgStar):
        self._setweights()
        with warnings.catch_warnings():
            #Avoids the warning raised because the RNN weights are no longer flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

    def reset(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=False)
        if hasattr(self.module, 'reset'): self.module.reset()
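
# Example (illustrative): wrapping an LSTM applies dropout to its hidden-to-hidden
# weight matrix on every call, while a raw, never-zeroed copy is kept as a parameter:
#   lstm = nn.LSTM(5, 7, batch_first=True)
#   wd = WeightDropout(lstm, 0.5)
#   h0 = (torch.zeros(1, 1, 7), torch.zeros(1, 1, 7))
#   out, h1 = wd(torch.randn(1, 4, 5), h0)
#   wd.weight_hh_l0_raw                # the undropped copy used to rebuild the weights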

class EmbeddingDropout(Module):
    "Apply dropout with probability `embed_p` to an embedding layer `emb`."

    def __init__(self, emb:nn.Module, embed_p:float):
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words:LongTensor, scale:Optional[float]=None)->Tensor:
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)
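
# Example (illustrative): the dropout mask has one value per vocabulary item, so whole
# word vectors are zeroed during training rather than individual activations:
#   enc = nn.Embedding(100, 7, padding_idx=1)
#   enc_dp = EmbeddingDropout(enc, 0.5)
#   emb = enc_dp(torch.randint(0, 100, (2, 10)))   # ~half the word vectors are all zeros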

class AWD_LSTM(Module):
    "AWD-LSTM/QRNN inspired by https://arxiv.org/abs/1708.02182."

    initrange=0.1

    def __init__(self, vocab_sz:int, emb_sz:int, n_hid:int, n_layers:int, pad_token:int=1, hidden_p:float=0.2,
                 input_p:float=0.6, embed_p:float=0.1, weight_p:float=0.5, qrnn:bool=False, bidir:bool=False):
        self.bs,self.qrnn,self.emb_sz,self.n_hid,self.n_layers = 1,qrnn,emb_sz,n_hid,n_layers
        self.n_dir = 2 if bidir else 1
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
        if self.qrnn:
            #Using QRNN requires an installation of cuda.
            from .qrnn import QRNN
            self.rnns = [QRNN(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.n_dir, 1,
                              save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True, bidirectional=bidir)
                         for l in range(n_layers)]
            for rnn in self.rnns:
                rnn.layers[0].linear = WeightDropout(rnn.layers[0].linear, weight_p, layer_names=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.n_dir, 1,
                                 batch_first=True, bidirectional=bidir) for l in range(n_layers)]
            self.rnns = [WeightDropout(rnn, weight_p) for rnn in self.rnns]
        self.rnns = nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input:Tensor, from_embeddings:bool=False)->Tuple[Tensor,Tensor]:
        if from_embeddings: bs,sl,es = input.size()
        else: bs,sl = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(input if from_embeddings else self.encoder_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden, cpu=False)
        return raw_outputs, outputs

    def _one_hidden(self, l:int)->Tensor:
        "Return one hidden state."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz) // self.n_dir
        return one_param(self).new(self.n_dir, self.bs, nh).zero_()

    def select_hidden(self, idxs):
        if self.qrnn: self.hidden = [h[:,idxs,:] for h in self.hidden]
        else: self.hidden = [(h[0][:,idxs,:],h[1][:,idxs,:]) for h in self.hidden]
        self.bs = len(idxs)

    def reset(self):
        "Reset the hidden states."
        [r.reset() for r in self.rnns if hasattr(r, 'reset')]
        if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
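
# Example (illustrative): the encoder returns the raw and post-dropout activations of
# every layer; the last layer projects back to emb_sz so its weights can be tied with a
# decoder:
#   enc = AWD_LSTM(vocab_sz=100, emb_sz=20, n_hid=50, n_layers=3)
#   raw_outputs, outputs = enc(torch.randint(0, 100, (2, 10)))   # input is (bs, sl)
#   outputs[-1].shape                                            # torch.Size([2, 10, 20])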

class LinearDecoder(Module):
    "To go on top of an RNNCore module and create a Language Model."
    initrange=0.1

    def __init__(self, n_out:int, n_hid:int, output_p:float, tie_encoder:nn.Module=None, bias:bool=True):
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.output_dp = RNNDropout(output_p)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input:Tuple[Tensor,Tensor])->Tuple[Tensor,Tensor,Tensor]:
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1])
        decoded = self.decoder(output)
        return decoded, raw_outputs, outputs

class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()
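
# Example (illustrative sketch of roughly what fastai's language-model factory assembles):
#   enc = AWD_LSTM(vocab_sz=100, emb_sz=20, n_hid=50, n_layers=3)
#   dec = LinearDecoder(n_out=100, n_hid=20, output_p=0.1, tie_encoder=enc.encoder)
#   lm = SequentialRNN(enc, dec)
#   lm.reset()                                  # forwarded to the AWD_LSTM child
#   decoded, raw_outs, outs = lm(torch.randint(0, 100, (2, 10)))
#   decoded.shape                               # torch.Size([2, 10, 100])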

def awd_lstm_lm_split(model:nn.Module) -> List[nn.Module]:
    "Split an RNN `model` into groups for differential learning rates."
    groups = [[rnn, dp] for rnn, dp in zip(model[0].rnns, model[0].hidden_dps)]
    return groups + [[model[0].encoder, model[0].encoder_dp, model[1]]]
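
# Example (illustrative): for the 3-layer language model sketched above this yields four
# groups, one per RNN layer plus a last group holding the embeddings and the decoder:
#   groups = awd_lstm_lm_split(lm)
#   len(groups)                                 # 4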

def awd_lstm_clas_split(model:nn.Module) -> List[nn.Module]:
    "Split an RNN `model` into groups for differential learning rates."
    groups = [[model[0].module.encoder, model[0].module.encoder_dp]]
    groups += [[rnn, dp] for rnn, dp in zip(model[0].module.rnns, model[0].module.hidden_dps)]
    return groups + [[model[1]]]

awd_lstm_lm_config = dict(emb_sz=400, n_hid=1152, n_layers=3, pad_token=1, qrnn=False, bidir=False, output_p=0.1,
                          hidden_p=0.15, input_p=0.25, embed_p=0.02, weight_p=0.2, tie_weights=True, out_bias=True)

awd_lstm_clas_config = dict(emb_sz=400, n_hid=1152, n_layers=3, pad_token=1, qrnn=False, bidir=False, output_p=0.4,
                            hidden_p=0.3, input_p=0.4, embed_p=0.05, weight_p=0.5)
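
# Example (illustrative): these dicts hold the default hyper-parameters and are meant to
# be copied and tweaked before being passed as the `config` argument of the text learner
# factories (e.g. `language_model_learner(data, AWD_LSTM, config=config)` in learner.py):
#   config = awd_lstm_lm_config.copy()
#   config['n_hid'] = 576                       # e.g. halve the hidden size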

def value2rgba(x:float, cmap:Callable=cm.RdYlGn, alpha_mult:float=1.0)->Tuple:
    "Convert a value `x` from 0 to 1 (inclusive) to an RGBA tuple according to `cmap`, with the alpha scaled by `alpha_mult`."
    c = cmap(x)
    rgb = (np.array(c[:-1]) * 255).astype(int)
    a = c[-1] * alpha_mult
    return tuple(rgb.tolist() + [a])

def piece_attn_html(pieces:List[str], attns:List[float], sep:str=' ', **kwargs)->str:
    "Return HTML code highlighting each piece in `pieces` with a background color mapped from its attention value."
    html_code,spans = ['<span style="font-family: monospace;">'], []
    for p, a in zip(pieces, attns):
        p = html.escape(p)
        c = str(value2rgba(a, alpha_mult=0.5, **kwargs))
        spans.append(f'<span title="{a:.3f}" style="background-color: rgba{c};">{p}</span>')
    html_code.append(sep.join(spans))
    html_code.append('</span>')
    return ''.join(html_code)
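
# Example (illustrative): tokens with attention close to 1 get a green background, those
# close to 0 a red one (with the default RdYlGn colormap):
#   piece_attn_html(['the', 'movie', 'was', 'great'], [0.1, 0.2, 0.3, 0.9])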

def show_piece_attn(*args, **kwargs):
    from IPython.display import display, HTML
    display(HTML(piece_attn_html(*args, **kwargs)))

def _eval_dropouts(mod):
    module_name = mod.__class__.__name__
    if 'Dropout' in module_name or 'BatchNorm' in module_name: mod.training = False
    for module in mod.children(): _eval_dropouts(module)

class TextClassificationInterpretation(ClassificationInterpretation):
    """Provides an interpretation of classification based on input sensitivity.
    For the moment this is only designed for AWD-LSTM, since the Transformer already has its own attention model.
    """

    def __init__(self, learn: Learner, preds: Tensor, y_true: Tensor, losses: Tensor, ds_type: DatasetType = DatasetType.Valid):
        super().__init__(learn,preds,y_true,losses,ds_type)
        self.model = learn.model

    def intrinsic_attention(self, text:str, class_id:int=None):
        """Calculate the intrinsic attention of the input w.r.t. an output `class_id`, or the classification given by the model if `None`.
        For reference, see the Sequential Jacobian section at https://www.cs.toronto.edu/~graves/preprint.pdf
        """
        self.model.train()
        _eval_dropouts(self.model)
        self.model.zero_grad()
        self.model.reset()
        ids = self.data.one_item(text)[0]
        emb = self.model[0].module.encoder(ids).detach().requires_grad_(True)
        lstm_output = self.model[0].module(emb, from_embeddings=True)
        self.model.eval()
        cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].softmax(dim=-1)
        if class_id is None: class_id = cl.argmax()
        cl[0][class_id].backward()
        attn = emb.grad.squeeze().abs().sum(dim=-1)
        attn /= attn.max()
        tokens = self.data.single_ds.reconstruct(ids[0])
        return tokens, attn

    def html_intrinsic_attention(self, text:str, class_id:int=None, **kwargs)->str:
        text, attn = self.intrinsic_attention(text, class_id)
        return piece_attn_html(text.text.split(), to_np(attn), **kwargs)

    def show_intrinsic_attention(self, text:str, class_id:int=None, **kwargs)->None:
        text, attn = self.intrinsic_attention(text, class_id)
        show_piece_attn(text.text.split(), to_np(attn), **kwargs)

    def show_top_losses(self, k:int, max_len:int=70)->None:
        """
        Create a tabulation showing the first `k` texts in top_losses along with their prediction, actual, loss, and the
        probability of the actual class. `max_len` is the maximum number of tokens displayed.
        """
        from IPython.display import display, HTML
        items = []
        tl_val,tl_idx = self.top_losses()
        for i,idx in enumerate(tl_idx):
            if k <= 0: break
            k -= 1
            tx,cl = self.data.dl(self.ds_type).dataset[idx]
            cl = cl.data
            classes = self.data.classes
            txt = ' '.join(tx.text.split(' ')[:max_len]) if max_len is not None else tx.text
            tmp = [txt, f'{classes[self.pred_class[idx]]}', f'{classes[cl]}', f'{self.losses[idx]:.2f}',
                   f'{self.preds[idx][cl]:.2f}']
            items.append(tmp)
        items = np.array(items)
        names = ['Text', 'Prediction', 'Actual', 'Loss', 'Probability']
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))
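
# Example (illustrative, assuming a text classifier `learn` trained with fastai):
#   interp = TextClassificationInterpretation.from_learner(learn)
#   interp.show_top_losses(5, max_len=70)
#   interp.show_intrinsic_attention("I really loved that movie!")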