Spaces:
Runtime error
Runtime error
| # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ | |
| This code is refer from: | |
| https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/channel_reduction_encoder.py | |
| https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/robust_scanner_decoder.py | |
| """ | |
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| import math | |
| import paddle | |
| from paddle import ParamAttr | |
| import paddle.nn as nn | |
| import paddle.nn.functional as F | |
class BaseDecoder(nn.Layer):
    """Abstract base class for RobustScanner decoders.

    Subclasses implement ``forward_train`` and ``forward_test``; ``forward``
    dispatches between them based on ``train_mode``.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward_train(self, feat, out_enc, targets, valid_ratios,
                      word_positions):
        # Must be overridden. Signature fixed to mirror how ``forward``
        # invokes it (the old stub took an ``img_metas`` 4-arg form that
        # did not match the call below).
        raise NotImplementedError

    def forward_test(self, feat, out_enc, valid_ratios, word_positions):
        # Must be overridden; signature mirrors the call in ``forward``.
        raise NotImplementedError

    def forward(self,
                feat,
                out_enc,
                label=None,
                valid_ratios=None,
                word_positions=None,
                train_mode=True):
        """Dispatch to the training (teacher-forced) or test branch.

        Args:
            feat (Tensor): Backbone feature map of shape ``(N, D_i, H, W)``.
            out_enc (Tensor): Encoder output of shape ``(N, D_m, H, W)``.
            label (Tensor|None): Target character indices; training only.
            valid_ratios (Tensor|None): Valid-width ratio of each image.
            word_positions (Tensor|None): Position index of each decode step.
            train_mode (bool): Select the teacher-forced training path.
        """
        self.train_mode = train_mode
        if train_mode:
            return self.forward_train(feat, out_enc, label, valid_ratios,
                                      word_positions)
        return self.forward_test(feat, out_enc, valid_ratios, word_positions)
class ChannelReductionEncoder(nn.Layer):
    """Project a feature map to a new channel count with a 1x1 convolution.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(ChannelReductionEncoder, self).__init__()
        # Pointwise conv (Xavier-normal init) — spatial size is preserved.
        self.layer = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=nn.initializer.XavierNormal())

    def forward(self, feat):
        """Map ``(N, C_in, H, W)`` features to ``(N, C_out, H, W)``."""
        return self.layer(feat)
def masked_fill(x, mask, value):
    """Return a copy of ``x`` with positions where ``mask`` is True set to ``value``."""
    fill = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, fill, x)
class DotProductAttentionLayer(nn.Layer):
    """Scaled dot-product attention with optional masking of the padded
    (invalid) width region of variable-width text images."""

    def __init__(self, dim_model=None):
        super().__init__()
        # 1/sqrt(d) scaling when the model dimension is known, else no scaling.
        self.scale = dim_model**-0.5 if dim_model is not None else 1.

    def forward(self, query, key, value, h, w, valid_ratios=None):
        """Compute attention glimpses.

        Args:
            query (Tensor): Shape ``(n, c, t)`` — transposed internally.
            key (Tensor): Shape ``(n, c, h*w)``.
            value (Tensor): Shape ``(n, c_v, h*w)``.
            h (int): Feature-map height (``h*w`` must equal key length).
            w (int): Feature-map width.
            valid_ratios (Tensor|None): Per-image valid-width ratio; columns
                beyond ``w * ratio`` get ``-inf`` logits (zero weight).

        Returns:
            Tensor: Glimpse of shape ``(n, c_v, t)``.
        """
        query = paddle.transpose(query, (0, 2, 1))
        logits = paddle.matmul(query, key) * self.scale
        n, c, t = logits.shape
        # reshape to (n, c, h, w) so masking can address width columns
        logits = paddle.reshape(logits, [n, c, h, w])
        if valid_ratios is not None:
            # cal mask of attention weight
            # NOTE: private Paddle API guarding in-place slice assignment
            # under dynamic-to-static conversion — do not restyle casually.
            with paddle.fluid.framework._stride_in_no_check_dy2st_diff():
                for i, valid_ratio in enumerate(valid_ratios):
                    valid_width = min(w, int(w * valid_ratio + 0.5))
                    if valid_width < w:
                        logits[i, :, :, valid_width:] = float('-inf')
        # reshape back to (n, c, t) for softmax over the key dimension
        logits = paddle.reshape(logits, [n, c, t])
        weights = F.softmax(logits, axis=2)
        value = paddle.transpose(value, (0, 2, 1))
        glimpse = paddle.matmul(weights, value)
        glimpse = paddle.transpose(glimpse, (0, 2, 1))
        return glimpse
class SequenceAttentionDecoder(BaseDecoder):
    """Sequence attention decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        rnn_layers (int): Number of RNN layers.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        start_idx (int): The index of `<SOS>`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        padding_idx (int): The index of `<PAD>`.
        dropout (float): Dropout rate.
        return_feature (bool): Return feature or logits as the result.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss as specified in
        :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
    """

    def __init__(self,
                 num_classes=None,
                 rnn_layers=2,
                 dim_input=512,
                 dim_model=128,
                 max_seq_len=40,
                 start_idx=0,
                 mask=True,
                 padding_idx=None,
                 dropout=0,
                 return_feature=False,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.return_feature = return_feature
        self.encode_value = encode_value
        self.max_seq_len = max_seq_len
        self.start_idx = start_idx
        self.mask = mask

        self.embedding = nn.Embedding(
            self.num_classes, self.dim_model, padding_idx=padding_idx)

        self.sequence_layer = nn.LSTM(
            input_size=dim_model,
            hidden_size=dim_model,
            num_layers=rnn_layers,
            time_major=False,
            dropout=dropout)

        self.attention_layer = DotProductAttentionLayer()

        self.prediction = None
        if not self.return_feature:
            # `<PAD>` is never predicted, hence C - 1 output classes.
            pred_num_classes = num_classes - 1
            self.prediction = nn.Linear(
                dim_model if encode_value else dim_input, pred_num_classes)

    def forward_train(self, feat, out_enc, targets, valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it would be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        tgt_embedding = self.embedding(targets)

        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, len_q, c_q = tgt_embedding.shape
        assert c_q == self.dim_model
        assert len_q <= self.max_seq_len

        # LSTM over the embedded target sequence produces the queries.
        query, _ = self.sequence_layer(tgt_embedding)
        query = paddle.transpose(query, (0, 2, 1))
        key = paddle.reshape(out_enc, [n, c_enc, h * w])
        if self.encode_value:
            value = key
        else:
            value = paddle.reshape(feat, [n, c_feat, h * w])

        attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
        attn_out = paddle.transpose(attn_out, (0, 2, 1))

        if self.return_feature:
            return attn_out

        out = self.prediction(attn_out)
        return out

    def forward_test(self, feat, out_enc, valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: The output logit sequence tensor of shape
            :math:`(N, T, C-1)`.
        """
        seq_len = self.max_seq_len
        batch_size = feat.shape[0]

        # Greedy autoregressive decoding, seeded with `<SOS>` everywhere.
        decode_sequence = (paddle.ones(
            (batch_size, seq_len), dtype='int64') * self.start_idx)

        outputs = []
        for i in range(seq_len):
            step_out = self.forward_test_step(feat, out_enc, decode_sequence,
                                              i, valid_ratios)
            outputs.append(step_out)
            max_idx = paddle.argmax(step_out, axis=1, keepdim=False)
            if i < seq_len - 1:
                decode_sequence[:, i + 1] = max_idx

        outputs = paddle.stack(outputs, 1)

        return outputs

    def forward_test_step(self, feat, out_enc, decode_sequence, current_step,
                          valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that
                stores history decoding result.
            current_step (int): Current decoding step.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted
            tokens at current time step.
        """
        embed = self.embedding(decode_sequence)

        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, _, c_q = embed.shape
        assert c_q == self.dim_model

        query, _ = self.sequence_layer(embed)
        query = paddle.transpose(query, (0, 2, 1))
        key = paddle.reshape(out_enc, [n, c_enc, h * w])
        if self.encode_value:
            value = key
        else:
            value = paddle.reshape(feat, [n, c_feat, h * w])

        # [n, c, l]
        attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
        out = attn_out[:, :, current_step]

        if self.return_feature:
            return out

        out = self.prediction(out)
        # Bug fix: paddle.nn.functional.softmax takes ``axis``, not ``dim``
        # (``dim`` is the PyTorch keyword and raised a TypeError here).
        out = F.softmax(out, axis=-1)

        return out
class PositionAwareLayer(nn.Layer):
    """Row-wise LSTM followed by a two-layer convolutional mixer.

    Each row of the feature map is encoded as an LSTM sequence along the
    width axis, then the result is mixed spatially with two 3x3 convolutions.
    """

    def __init__(self, dim_model, rnn_layers=2):
        super().__init__()
        self.dim_model = dim_model
        self.rnn = nn.LSTM(
            input_size=dim_model,
            hidden_size=dim_model,
            num_layers=rnn_layers,
            time_major=False)
        self.mixer = nn.Sequential(
            nn.Conv2D(
                dim_model, dim_model, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2D(
                dim_model, dim_model, kernel_size=3, stride=1, padding=1))

    def forward(self, img_feature):
        """Encode ``(n, c, h, w)`` features row-wise, returning the same shape."""
        n, c, h, w = img_feature.shape
        # (n, c, h, w) -> (n*h, w, c): every row becomes one LSTM sequence.
        seq = paddle.transpose(img_feature, (0, 2, 3, 1))
        seq = paddle.reshape(seq, (n * h, w, c))
        seq_out, _ = self.rnn(seq)
        # Restore (n, c, h, w) layout for the convolutional mixer.
        fmap = paddle.reshape(seq_out, (n, h, w, c))
        fmap = paddle.transpose(fmap, (0, 3, 1, 2))
        return self.mixer(fmap)
class PositionAttentionDecoder(BaseDecoder):
    """Position attention decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        rnn_layers (int): Number of RNN layers.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        return_feature (bool): Return feature or logits as the result.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss.
    """

    def __init__(self,
                 num_classes=None,
                 rnn_layers=2,
                 dim_input=512,
                 dim_model=128,
                 max_seq_len=40,
                 mask=True,
                 return_feature=False,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.return_feature = return_feature
        self.encode_value = encode_value
        self.mask = mask

        # One embedding slot per decode position (plus one spare).
        self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model)

        self.position_aware_module = PositionAwareLayer(self.dim_model,
                                                        rnn_layers)

        self.attention_layer = DotProductAttentionLayer()

        self.prediction = None
        if not self.return_feature:
            # `<PAD>` is never predicted, hence C - 1 output classes.
            pred_num_classes = num_classes - 1
            self.prediction = nn.Linear(
                dim_model if encode_value else dim_input, pred_num_classes)

    def _get_position_index(self, length, batch_size):
        """Build an ``(batch_size, length)`` tensor of position indices 0..length-1."""
        per_sample = [
            paddle.arange(
                0, end=length, step=1, dtype='int64')
            for _ in range(batch_size)
        ]
        return paddle.stack(per_sample, axis=0)

    def _attend(self, feat, out_enc, valid_ratios, position_index):
        # Shared body of forward_train/forward_test: position embeddings act
        # as queries over the position-aware key map.
        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input

        position_out_enc = self.position_aware_module(out_enc)

        query = paddle.transpose(
            self.embedding(position_index), (0, 2, 1))
        key = paddle.reshape(position_out_enc, (n, c_enc, h * w))
        if self.encode_value:
            value = paddle.reshape(out_enc, (n, c_enc, h * w))
        else:
            value = paddle.reshape(feat, (n, c_feat, h * w))

        attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
        attn_out = paddle.transpose(attn_out, (0, 2, 1))  # [n, len_q, dim_v]

        if self.return_feature:
            return attn_out
        return self.prediction(attn_out)

    def forward_train(self, feat, out_enc, targets, valid_ratios,
                      position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it will be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        _, len_q = targets.shape
        assert len_q <= self.max_seq_len
        return self._attend(feat, out_enc, valid_ratios, position_index)

    def forward_test(self, feat, out_enc, valid_ratios, position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it would be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        return self._attend(feat, out_enc, valid_ratios, position_index)
class RobustScannerFusionLayer(nn.Layer):
    """Gated fusion of two equally-shaped glimpses via a linear layer + GLU."""

    def __init__(self, dim_model, dim=-1):
        super(RobustScannerFusionLayer, self).__init__()
        self.dim_model = dim_model
        self.dim = dim
        self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2)

    def forward(self, x0, x1):
        """Concatenate, project, and gate; output has ``dim_model`` features."""
        assert x0.shape == x1.shape
        combined = paddle.concat([x0, x1], self.dim)
        projected = self.linear_layer(combined)
        # GLU gates one half of the projection with the sigmoid of the other,
        # halving the concatenated width back to dim_model.
        return F.glu(projected, self.dim)
class RobustScannerDecoder(BaseDecoder):
    """Decoder for RobustScanner.

    Fuses a hybrid (content-attention) branch and a position-attention branch
    with a gated fusion layer, then projects to character logits.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        start_idx (int): The index of `<SOS>`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        padding_idx (int): The index of `<PAD>`.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss as specified in
        :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
    """

    def __init__(self,
                 num_classes=None,
                 dim_input=512,
                 dim_model=128,
                 hybrid_decoder_rnn_layers=2,
                 hybrid_decoder_dropout=0,
                 position_decoder_rnn_layers=2,
                 max_seq_len=40,
                 start_idx=0,
                 mask=True,
                 padding_idx=None,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.encode_value = encode_value
        self.start_idx = start_idx
        self.padding_idx = padding_idx
        self.mask = mask

        # init hybrid decoder — returns features (return_feature=True) so
        # fusion happens before the final projection.
        self.hybrid_decoder = SequenceAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=hybrid_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_seq_len,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            dropout=hybrid_decoder_dropout,
            encode_value=encode_value,
            return_feature=True)

        # init position decoder — also feature-returning for fusion.
        self.position_decoder = PositionAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=position_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_seq_len,
            mask=mask,
            encode_value=encode_value,
            return_feature=True)

        # Fusion dim matches the glimpse dim chosen by encode_value.
        self.fusion_module = RobustScannerFusionLayer(
            self.dim_model if encode_value else dim_input)

        # `<PAD>` is never predicted, hence C - 1 output classes.
        pred_num_classes = num_classes - 1
        self.prediction = nn.Linear(dim_model if encode_value else dim_input,
                                    pred_num_classes)

    def forward_train(self, feat, out_enc, target, valid_ratios,
                      word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            target (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`.
        """
        # Teacher-forced: both branches see the full target sequence at once.
        hybrid_glimpse = self.hybrid_decoder.forward_train(
            feat, out_enc, target, valid_ratios)
        position_glimpse = self.position_decoder.forward_train(
            feat, out_enc, target, valid_ratios, word_positions)

        fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse)

        out = self.prediction(fusion_out)

        return out

    def forward_test(self, feat, out_enc, valid_ratios, word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: The output logit sequence tensor of shape
            :math:`(N, T, C-1)`.
        """
        seq_len = self.max_seq_len
        batch_size = feat.shape[0]

        # Greedy autoregressive decoding, seeded with `<SOS>` everywhere.
        decode_sequence = (paddle.ones(
            (batch_size, seq_len), dtype='int64') * self.start_idx)

        # Position branch is input-independent per step, so compute it once.
        position_glimpse = self.position_decoder.forward_test(
            feat, out_enc, valid_ratios, word_positions)

        outputs = []
        for i in range(seq_len):
            # Hybrid branch conditions on the tokens decoded so far.
            hybrid_glimpse_step = self.hybrid_decoder.forward_test_step(
                feat, out_enc, decode_sequence, i, valid_ratios)

            fusion_out = self.fusion_module(hybrid_glimpse_step,
                                            position_glimpse[:, i, :])

            char_out = self.prediction(fusion_out)
            char_out = F.softmax(char_out, -1)
            outputs.append(char_out)
            max_idx = paddle.argmax(char_out, axis=1, keepdim=False)
            # Feed the greedy choice back as the next step's input token.
            if i < seq_len - 1:
                decode_sequence[:, i + 1] = max_idx

        outputs = paddle.stack(outputs, 1)

        return outputs
class RobustScannerHead(nn.Layer):
    """RobustScanner recognition head: channel-reduction encoder + decoder."""

    def __init__(self,
                 out_channels,  # 90 + unknown + start + padding
                 in_channels,
                 enc_outchannles=128,
                 hybrid_dec_rnn_layers=2,
                 hybrid_dec_dropout=0,
                 position_dec_rnn_layers=2,
                 start_idx=0,
                 max_text_length=40,
                 mask=True,
                 padding_idx=None,
                 encode_value=False,
                 **kwargs):
        super(RobustScannerHead, self).__init__()

        # encoder module: 1x1 conv reducing backbone channels to the model dim
        self.encoder = ChannelReductionEncoder(
            in_channels=in_channels, out_channels=enc_outchannles)

        # decoder module: fused hybrid + position attention branches
        self.decoder = RobustScannerDecoder(
            num_classes=out_channels,
            dim_input=in_channels,
            dim_model=enc_outchannles,
            hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers,
            hybrid_decoder_dropout=hybrid_dec_dropout,
            position_decoder_rnn_layers=position_dec_rnn_layers,
            max_seq_len=max_text_length,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            encode_value=encode_value)

    def forward(self, inputs, targets=None):
        '''
        targets: [label, valid_ratio, word_positions]
        '''
        out_enc = self.encoder(inputs)
        word_positions = targets[-1]
        valid_ratios = targets[-2] if len(targets) > 1 else None

        if self.training:
            # Teacher-forced path: labels drive the hybrid branch.
            label = paddle.to_tensor(targets[0], dtype='int64')
            final_out = self.decoder(inputs, out_enc, label, valid_ratios,
                                     word_positions)
        else:
            # Greedy autoregressive inference.
            final_out = self.decoder(
                inputs,
                out_enc,
                label=None,
                valid_ratios=valid_ratios,
                word_positions=word_positions,
                train_mode=False)
        return final_out