File size: 2,578 Bytes

7934b29

# Copyright 2022 The Google AI Language Team Authors and
# The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from typing import Dict, List, Optional

from nemo.core.classes import Dataset
from nemo.core.neural_types import NeuralType, StringLabel, StringType

__all__ = ['PTuneTextClassificationDataset', 'token_wrapper']


def load_file(filename):
    data = []
    with open(filename, "r") as f:
        for line in f.readlines():
            data.append(json.loads(line))
    return data


def token_wrapper(token: str) -> str:
    return 'Ġ' + token


class PTuneTextClassificationDataset(Dataset):
    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        return {"sentences": [NeuralType(('T'), StringType())], "labels": [NeuralType(('T'), StringLabel())]}

    def __init__(self, input_file: str, queries: List[str] = None, prompt: str = 'Sentiment'):
        """
        A dataset class that feed data for P-tuning model
        Args:
            input_file: loose json data file. The format is {"sentence":"input sentence", "label":"class label"}
            queries: list of query input sentences
            prompt: the prompt string appended at the end of your input sentence
        """
        super().__init__()
        if input_file and not os.path.exists(input_file):
            raise FileNotFoundError(
                f'Data file `{input_file}` not found! Each line of the data file should contain json object'
                f'where `sentence` key maps to sentence and `label` key maps to label'
            )
        if queries is None:
            json_data = load_file(input_file)
        else:
            json_data = []
            for line in queries:
                json_data.append({'sentence': line + f' {prompt} ', 'label': ''})
        self.data = json_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]['sentence'], self.data[i]['label']