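"""Prepare MultiClinSum subclaim data for verl RL training.

Reads a JSON file of clinical records with extracted subclaims, formats each
record into verl's chat-prompt schema with a rule-based reward_model, splits
95%/5% into train/test, and writes both splits as Parquet files.
"""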
import os
import json
import datasets
import argparse
# Load the prompt template; it must contain {source_lang}, {gold_summary},
# and {full_text} placeholders, which are filled in per example below.
with open("/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt", 'r') as f:
    PROMPT_TEMPLATE = f.read()
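
# Fail fast if the template file lacks a placeholder that .format() fills
# below. (Defensive check; assumes the template uses exactly these fields.)
for _ph in ("{source_lang}", "{gold_summary}", "{full_text}"):
    if _ph not in PROMPT_TEMPLATE:
        raise ValueError(f"Prompt template is missing the {_ph} placeholder")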

def make_map_fn(split, data_source):
    """Return a map function that converts one raw record into verl's RL schema."""
    def process_fn(example, idx):
        # Expected input record keys: 'fulltext', 'summary', and optionally
        # 'fulltext_subclaims', 'summary_subclaims', 'id'.
        full_text = example.pop('fulltext')
        gold_summary = example.pop('summary')
        fulltext_subclaims = example.pop('fulltext_subclaims', None)
        summary_subclaims = example.pop('summary_subclaims', None)

        # Fill the template; source_lang is fixed to 'English' since the
        # input file is the English (en) split.
        prompt_content = PROMPT_TEMPLATE.format(
            source_lang="English",
            gold_summary=gold_summary,
            full_text=full_text
        )

        return {
            "data_source": data_source,
            # verl expects a chat-style prompt: a list of {role, content} messages.
            "prompt": [{
                "role": "user",
                "content": prompt_content
            }],
            "ability": "summarization",
            # Rule-based reward: the subclaim lists are the ground truth the
            # reward function scores generated summaries against.
            "reward_model": {
                "style": "rule",
                "ground_truth": {
                    "summary_subclaims": summary_subclaims,
                    "fulltext_subclaims": fulltext_subclaims
                }
            },
            "extra_info": {
                "split": split,
                "index": idx,
                "original_id": example.get('id', idx),
                "fulltext_subclaims": fulltext_subclaims,
                "summary_subclaims": summary_subclaims
            }
        }
    return process_fn

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path',
                        default='/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json',
                        help='Raw JSON file of records with extracted subclaims.')
    parser.add_argument('--local_dir',
                        default='/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset',
                        help='Output directory for train.parquet and test.parquet.')
    args = parser.parse_args()

    data_source = 'multiclinsum'

    # Load the raw JSON records and wrap them in a HuggingFace Dataset.
    with open(args.input_path, 'r') as f:
        raw_data = json.load(f)
    dataset = datasets.Dataset.from_list(raw_data)

    # Deterministic 95%/5% train/test split.
    dataset_split = dataset.train_test_split(test_size=0.05, seed=42, shuffle=True)
    # Apply the mapping transformation per split.
    processed_train = dataset_split['train'].map(
        function=make_map_fn('train', data_source),
        with_indices=True
    )
    processed_test = dataset_split['test'].map(
        function=make_map_fn('test', data_source),
        with_indices=True
    )
    # Write the processed splits as Parquet files.
    os.makedirs(args.local_dir, exist_ok=True)
    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")