|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
import os |
|
|
from typing import Dict, List, Optional |
|
|
|
|
|
from nemo.core.classes import Dataset |
|
|
from nemo.core.neural_types import NeuralType, StringLabel, StringType |
|
|
|
|
|
__all__ = ['PTuneTextClassificationDataset', 'token_wrapper'] |
|
|
|
|
|
|
|
|
def load_file(filename): |
|
|
data = [] |
|
|
with open(filename, "r") as f: |
|
|
for line in f.readlines(): |
|
|
data.append(json.loads(line)) |
|
|
return data |
|
|
|
|
|
|
|
|
def token_wrapper(token: str) -> str: |
|
|
return 'Ġ' + token |
|
|
|
|
|
|
|
|
class PTuneTextClassificationDataset(Dataset): |
|
|
@property |
|
|
def output_types(self) -> Optional[Dict[str, NeuralType]]: |
|
|
return {"sentences": [NeuralType(('T'), StringType())], "labels": [NeuralType(('T'), StringLabel())]} |
|
|
|
|
|
def __init__(self, input_file: str, queries: List[str] = None, prompt: str = 'Sentiment'): |
|
|
""" |
|
|
A dataset class that feed data for P-tuning model |
|
|
Args: |
|
|
input_file: loose json data file. The format is {"sentence":"input sentence", "label":"class label"} |
|
|
queries: list of query input sentences |
|
|
prompt: the prompt string appended at the end of your input sentence |
|
|
""" |
|
|
super().__init__() |
|
|
if input_file and not os.path.exists(input_file): |
|
|
raise FileNotFoundError( |
|
|
f'Data file `{input_file}` not found! Each line of the data file should contain json object' |
|
|
f'where `sentence` key maps to sentence and `label` key maps to label' |
|
|
) |
|
|
if queries is None: |
|
|
json_data = load_file(input_file) |
|
|
else: |
|
|
json_data = [] |
|
|
for line in queries: |
|
|
json_data.append({'sentence': line + f' {prompt} ', 'label': ''}) |
|
|
self.data = json_data |
|
|
|
|
|
def __len__(self): |
|
|
return len(self.data) |
|
|
|
|
|
def __getitem__(self, i): |
|
|
return self.data[i]['sentence'], self.data[i]['label'] |
|
|
|