File size: 9,447 Bytes
d670799
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Sequence, Tuple, Union

import mmengine
import numpy as np
from mmengine.infer import BaseInferencer
from mmengine.structures import InstanceData

from mmaction.utils import ConfigType
from .actionrecog_inferencer import ActionRecogInferencer

# Type aliases shared across the inferencer API below.
InstanceList = List[InstanceData]
InputType = Union[str, np.ndarray]  # a file/URL path string or a decoded array
InputsType = Union[InputType, Sequence[InputType]]
PredType = Union[InstanceData, InstanceList]
ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]


class MMAction2Inferencer(BaseInferencer):
    """MMAction2 Inferencer. It's a unified inferencer interface for video
    analyse task, currently including: ActionRecog. and it can be used to
    perform end-to-end action recognition inference.

    Args:
        rec (str, optional): Pretrained action recognition algorithm.
            It's the path to the config file or the model name defined in
            metafile. For example, it could be:

            - model alias, e.g. ``'slowfast'``,
            - config name, e.g. ``'slowfast_r50_8xb8-8x8x1-256e_kinetics400
                -rgb'``,
            - config path

            Defaults to ``None``.
        rec_weights (str, optional): Path to the custom checkpoint file of
            the selected rec model. If it is not specified and "rec" is a
            model name of metafile, the weights will be loaded from metafile.
            Defaults to None.
        device (str, optional): Device to run inference. For example,
            it could be 'cuda' or 'cpu'. If None, the available
            device will be automatically used. Defaults to None.
        label_file (str, optional): Label file for the dataset.
        input_format (str): Input video format. Choices are 'video',
            'rawframes', 'array'. 'video' means input data is a video file,
            'rawframes' means input data is a video frame folder, and 'array'
            means input data is a np.ndarray. Defaults to 'video'.
    """

    # Keyword routing tables consumed by ``BaseInferencer._dispatch_kwargs``:
    # each user kwarg must belong to exactly one of these sets.
    preprocess_kwargs: set = set()
    forward_kwargs: set = set()
    visualize_kwargs: set = {
        'return_vis', 'show', 'wait_time', 'vid_out_dir', 'draw_pred', 'fps',
        'out_type', 'target_resolution'
    }
    postprocess_kwargs: set = {
        'print_result', 'pred_out_file', 'return_datasample'
    }

    def __init__(self,
                 rec: Optional[str] = None,
                 rec_weights: Optional[str] = None,
                 device: Optional[str] = None,
                 label_file: Optional[str] = None,
                 input_format: str = 'video') -> None:
        if rec is None:
            raise ValueError('rec algorithm should be provided.')

        self.visualizer = None
        self.num_visualized_imgs = 0

        # ``rec`` is guaranteed non-None here (checked above), so the
        # recognition inferencer is always constructed.
        self.actionrecog_inferencer = ActionRecogInferencer(
            rec, rec_weights, device, label_file, input_format)
        self.mode = 'rec'

    def _init_pipeline(self, cfg: ConfigType) -> None:
        """No-op: the wrapped ActionRecogInferencer owns its own pipeline."""
        pass

    def forward(self, inputs: InputType, batch_size: int,
                **forward_kwargs) -> PredType:
        """Forward the inputs to the model.

        Args:
            inputs (InputsType): The inputs to be forwarded.
            batch_size (int): Batch size. Defaults to 1.

        Returns:
            Dict: The prediction results. Possibly with keys "rec".
        """
        result = {}
        if self.mode == 'rec':
            predictions = self.actionrecog_inferencer(
                inputs,
                return_datasamples=True,
                batch_size=batch_size,
                **forward_kwargs)['predictions']
            # Wrap each per-input prediction in its own list so downstream
            # consumers can iterate groups of data samples uniformly.
            result['rec'] = [[p] for p in predictions]

        return result

    def visualize(self, inputs: InputsType, preds: PredType,
                  **kwargs) -> List[np.ndarray]:
        """Visualize predictions.

        Args:
            inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
            preds (List[Dict]): Predictions of the model.
            show (bool): Whether to display the image in a popup window.
                Defaults to False.
            wait_time (float): The interval of show (s). Defaults to 0.
            draw_pred (bool): Whether to draw predicted bounding boxes.
                Defaults to True.
            fps (int): Frames per second for saving video. Defaults to 4.
            out_type (str): Output format type, choose from 'img', 'gif',
                'video'. Defaults to ``'img'``.
            target_resolution (Tuple[int], optional): Set to
                (desired_width desired_height) to have resized frames. If
                either dimension is None, the frames are resized by keeping
                the existing aspect ratio. Defaults to None.
            vid_out_dir (str): Output directory of visualization results.
                If left as empty, no file will be saved. Defaults to ''.
        """
        if 'rec' in self.mode:
            return self.actionrecog_inferencer.visualize(
                inputs, preds['rec'][0], **kwargs)

    def __call__(
        self,
        inputs: InputsType,
        batch_size: int = 1,
        **kwargs,
    ) -> dict:
        """Call the inferencer.

        Args:
            inputs (InputsType): Inputs for the inferencer. It can be a path
                to image / image directory, or an array, or a list of these.
            batch_size (int): Batch size. Defaults to 1.
            **kwargs: Key words arguments passed to :meth:`preprocess`,
                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
                Each key in kwargs should be in the corresponding set of
                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
                and ``postprocess_kwargs``.

        Returns:
            dict: Inference and visualization results.
        """
        (
            preprocess_kwargs,
            forward_kwargs,
            visualize_kwargs,
            postprocess_kwargs,
        ) = self._dispatch_kwargs(**kwargs)

        ori_inputs = self._inputs_to_list(inputs)

        preds = self.forward(ori_inputs, batch_size, **forward_kwargs)

        visualization = self.visualize(
            ori_inputs, preds,
            **visualize_kwargs)  # type: ignore  # noqa: E501
        results = self.postprocess(preds, visualization, **postprocess_kwargs)
        return results

    def _inputs_to_list(self, inputs: InputsType) -> list:
        """Preprocess the inputs to a list. The main difference from mmengine
        version is that we don't list a directory cause input could be a frame
        folder.

        Preprocess inputs to a list according to its type:

        - list or tuple: return inputs
        - str: return a list containing the string. The string
              could be a path to file, a url or other types of string according
              to the task.

        Args:
            inputs (InputsType): Inputs for the inferencer.

        Returns:
            list: List of input for the :meth:`preprocess`.
        """
        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        return list(inputs)

    def postprocess(self,
                    preds: PredType,
                    visualization: Optional[List[np.ndarray]] = None,
                    print_result: bool = False,
                    pred_out_file: str = '',
                    return_datasample: bool = False
                    ) -> Union[ResType, Tuple[ResType, np.ndarray]]:
        """Postprocess predictions.

        Args:
            preds (Dict): Predictions of the model.
            visualization (Optional[np.ndarray]): Visualized predictions.
            print_result (bool): Whether to print the result.
                Defaults to False.
            pred_out_file (str): Output file name to store predictions
                without images. Supported file formats are "json", "yaml/yml"
                and "pickle/pkl". Defaults to ''.
            return_datasample (bool): Whether to keep the raw prediction
                data samples in the result instead of converting them to
                plain dicts. Declared in ``postprocess_kwargs``, so it must
                be accepted here; previously passing it raised a TypeError.
                Defaults to False.

        Returns:
            Dict or List[Dict]: Each dict contains the inference result of
            each image. Possible keys are "rec_labels", "rec_scores"
        """
        result_dict = {}
        # One result entry per input; the per-task lists in ``preds`` are
        # aligned with the inputs, so any value gives the input count.
        pred_results = [{} for _ in range(len(next(iter(preds.values()))))]
        if 'rec' in self.mode:
            for i, rec_pred in enumerate(preds['rec']):
                if return_datasample:
                    # Hand back the raw data samples untouched.
                    pred_results[i] = rec_pred
                    continue
                result = dict(rec_labels=[], rec_scores=[])
                for rec_pred_instance in rec_pred:
                    rec_dict_res = self.actionrecog_inferencer.pred2dict(
                        rec_pred_instance)
                    result['rec_labels'].append(rec_dict_res['pred_labels'])
                    result['rec_scores'].append(rec_dict_res['pred_scores'])
                pred_results[i].update(result)

        result_dict['predictions'] = pred_results
        if print_result:
            print(result_dict)
        # Only dump plain-dict results; raw data samples are not guaranteed
        # to be serializable by ``mmengine.dump``.
        if pred_out_file != '' and not return_datasample:
            mmengine.dump(result_dict, pred_out_file)
        result_dict['visualization'] = visualization
        return result_dict