| | import json |
| |
|
| | from datasets import Dataset |
| |
|
| | from opencompass.registry import LOAD_DATASET |
| |
|
| | from .base import BaseDataset |
| |
|
| |
|
@LOAD_DATASET.register_module()
class C3Dataset(BaseDataset):
    """Dataset loader that flattens a C3-style JSON file into one row per
    question, with an integer answer index as the label."""

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and return it as a ``Dataset``.

        Every top-level entry is indexed positionally: element ``0`` is a
        list of paragraphs and element ``1`` is a list of question dicts
        carrying the keys ``'question'``, ``'choice'`` and ``'answer'``.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3``, ``choices`` and ``label``, where
            ``label`` is the position of the answer inside the choice list.

        Raises:
            ValueError: If a question's answer is not among its choices.
        """
        with open(path, 'r', encoding='utf-8') as fp:
            documents = json.load(fp)

        # Column-oriented accumulator; the key order here is the column
        # order handed to ``Dataset.from_dict``.
        columns = {
            'content': [],
            'question': [],
            'choice0': [],
            'choice1': [],
            'choice2': [],
            'choice3': [],
            'choices': [],
            'label': [],
        }

        for document in documents:
            # Pieces of a paragraph are concatenated directly; paragraphs
            # are separated by a single space.
            passage = ' '.join(
                ''.join(paragraph) for paragraph in document[0])

            for item in document[1]:
                options = item['choice']
                # Resolve the answer position before padding so the index
                # always refers to a genuine choice.
                answer_idx = options.index(item['answer'])

                # Questions with fewer than four choices are padded by
                # repeating the first choice until four slots are filled.
                shortfall = 4 - len(options)
                if shortfall > 0:
                    options.extend([options[0]] * shortfall)

                columns['content'].append(passage)
                columns['question'].append(item['question'])
                columns['choices'].append(options)
                columns['choice0'].append(options[0])
                columns['choice1'].append(options[1])
                columns['choice2'].append(options[2])
                columns['choice3'].append(options[3])
                columns['label'].append(answer_idx)

        return Dataset.from_dict(columns)
| |
|
| |
|
@LOAD_DATASET.register_module()
class C3Dataset_V2(BaseDataset):
    """Dataset loader that flattens a C3-style JSON file into one row per
    question, with a letter (``A``-``D``) as the label."""

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and return it as a ``Dataset``.

        Every top-level entry is indexed positionally: element ``0`` is a
        list of paragraphs and element ``1`` is a list of question dicts
        carrying the keys ``'question'``, ``'choice'`` and ``'answer'``.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` built from one record per question with the
            fields ``content``, ``question``, ``choice0`` .. ``choice3``
            and ``label``.

        Raises:
            ValueError: If a question's answer is not among its choices.
            IndexError: If the answer sits at position four or later, as
                only the letters ``A``-``D`` are available.
        """
        with open(path, 'r', encoding='utf-8') as fp:
            documents = json.load(fp)

        records = []
        for document in documents:
            # Paragraphs (and the pieces inside each one) are concatenated
            # with no separator.
            passage = ''.join(
                ''.join(paragraph) for paragraph in document[0])

            for item in document[1]:
                options = item['choice']
                # Map the answer's position onto its letter before any
                # placeholder options are appended.
                letter = 'ABCD'[options.index(item['answer'])]

                # Top the option list up to four entries with a
                # placeholder string.
                shortfall = 4 - len(options)
                if shortfall > 0:
                    options.extend(['[NULL]'] * shortfall)

                records.append({
                    'content': passage,
                    'question': item['question'],
                    'choice0': options[0],
                    'choice1': options[1],
                    'choice2': options[2],
                    'choice3': options[3],
                    'label': letter,
                })

        return Dataset.from_list(records)
| |
|