NeMo / nemo /collections /nlp /data /text_classification /ptune_text_classification_dataset.py
camenduru's picture
thanks to NVIDIA ❤
7934b29
# Copyright 2022 The Google AI Language Team Authors and
# The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from typing import Dict, List, Optional
from nemo.core.classes import Dataset
from nemo.core.neural_types import NeuralType, StringLabel, StringType
__all__ = ['PTuneTextClassificationDataset', 'token_wrapper']
def load_file(filename):
data = []
with open(filename, "r") as f:
for line in f.readlines():
data.append(json.loads(line))
return data
def token_wrapper(token: str) -> str:
return 'Ġ' + token
class PTuneTextClassificationDataset(Dataset):
@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"sentences": [NeuralType(('T'), StringType())], "labels": [NeuralType(('T'), StringLabel())]}
def __init__(self, input_file: str, queries: List[str] = None, prompt: str = 'Sentiment'):
"""
A dataset class that feed data for P-tuning model
Args:
input_file: loose json data file. The format is {"sentence":"input sentence", "label":"class label"}
queries: list of query input sentences
prompt: the prompt string appended at the end of your input sentence
"""
super().__init__()
if input_file and not os.path.exists(input_file):
raise FileNotFoundError(
f'Data file `{input_file}` not found! Each line of the data file should contain json object'
f'where `sentence` key maps to sentence and `label` key maps to label'
)
if queries is None:
json_data = load_file(input_file)
else:
json_data = []
for line in queries:
json_data.append({'sentence': line + f' {prompt} ', 'label': ''})
self.data = json_data
def __len__(self):
return len(self.data)
def __getitem__(self, i):
return self.data[i]['sentence'], self.data[i]['label']