shahidul034
/

readctrl

Model card Files Files and versions

readctrl / code /RL_model /verl /Search-R1 /dataset /data_prep.py

shahidul034's picture

Add files using upload-large-folder tool

d76c61c verified about 2 months ago

3.17 kB

	import os
	import json
	import datasets
	import argparse
	from verl.utils.hdfs_io import copy, makedirs

	# 1. Load the prompt template once, at import time.
	# The template text is later filled in with str.format() using the fields
	# source_lang, gold_summary and full_text (see make_map_fn).
	# Template location:
	# /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt
	# The encoding is given explicitly so that decoding the template does not
	# depend on the locale of the machine running the script.
	with open("/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt", 'r', encoding='utf-8') as f:
	    PROMPT_TEMPLATE = f.read()

	def make_map_fn(split, data_source):
	    """Build the per-example mapping function for one dataset split.

	    Args:
	        split: Split label stored under ``extra_info["split"]`` of every
	            produced record.
	        data_source: Value stored under ``data_source`` of every produced
	            record.

	    Returns:
	        A ``process_fn(example, idx)`` callable that converts one raw record
	        into the output record described in ``process_fn``.
	    """

	    def process_fn(example, idx):
	        """Convert one raw record into a prompt / reward record.

	        Args:
	            example: Mapping with the keys ``fulltext`` and ``summary``
	                (both required) and optionally ``id``.
	            idx: Index of the record; also used as ``original_id`` when
	                ``example`` has no ``id`` key.

	        Returns:
	            A dict with the keys ``data_source``, ``prompt``, ``ability``,
	            ``reward_model`` and ``extra_info``.

	        Raises:
	            KeyError: If ``fulltext`` or ``summary`` is missing.
	        """
	        # pop() removes the raw text fields from `example` as a side
	        # effect; this is kept exactly as in the original code.
	        article = example.pop('fulltext')
	        reference = example.pop('summary')

	        # 'English' is hard-coded as the source language of the input data.
	        user_message = {
	            "role": "user",
	            "content": PROMPT_TEMPLATE.format(
	                source_lang="English",
	                gold_summary=reference,
	                full_text=article,
	            ),
	        }

	        return {
	            "data_source": data_source,
	            "prompt": [user_message],
	            "ability": "summarization",
	            "reward_model": {
	                "style": "rule",
	                "ground_truth": reference,
	            },
	            "extra_info": {
	                "split": split,
	                "index": idx,
	                "original_id": example.get('id', idx),
	            },
	        }

	    return process_fn

	if __name__ == '__main__':
	    # Script entry point: read the input JSON, split it 95/5 into
	    # train/test, convert every record with make_map_fn and write one
	    # Parquet file per split into --local_dir.
	    parser = argparse.ArgumentParser()
	    # Path to the input JSON file.
	    parser.add_argument(
	        '--input_path',
	        default='/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json',
	    )
	    # Directory that receives train.parquet and test.parquet.
	    parser.add_argument(
	        '--local_dir',
	        default='/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset',
	    )
	    args = parser.parse_args()

	    data_source = 'multiclinsum'

	    # Load the local JSON file. The encoding is given explicitly so that
	    # decoding does not depend on the locale of the machine.
	    with open(args.input_path, 'r', encoding='utf-8') as f:
	        raw_data = json.load(f)

	    # Convert to a HuggingFace Dataset (from_list takes a list of dicts).
	    dataset = datasets.Dataset.from_list(raw_data)

	    # Split into train/test (95% train, 5% test); the seed is fixed at 42.
	    split_dataset = dataset.train_test_split(test_size=0.05, seed=42)

	    # Apply the mapping transformation to each split. with_indices=True
	    # makes map() pass the row index as the second argument of process_fn.
	    processed_train = split_dataset["train"].map(
	        function=make_map_fn('train', data_source),
	        with_indices=True,
	    )
	    processed_test = split_dataset["test"].map(
	        function=make_map_fn('test', data_source),
	        with_indices=True,
	    )

	    # Create the output directory if it doesn't exist.
	    os.makedirs(args.local_dir, exist_ok=True)

	    # Save both splits as Parquet files in the output directory.
	    train_output_path = os.path.join(args.local_dir, 'train.parquet')
	    test_output_path = os.path.join(args.local_dir, 'test.parquet')
	    processed_train.to_parquet(train_output_path)
	    processed_test.to_parquet(test_output_path)

	    print("--- Dataset Preparation Complete ---")
	    print(f"Train file saved to: {train_output_path}")
	    print(f"Test file saved to: {test_output_path}")
	    print(f"Total train records: {len(processed_train)}")
	    print(f"Total test records: {len(processed_test)}")