Spaces:

JSCPPProgrammer
/

gensearcher-firered

Paused

App Files Files Community

gensearcher-firered / vendor /rllm /scripts /data /dedupe_dataset.py

JSCPPProgrammer

Initial: GenSearcher workflow + FireRed /generate adapter + Gradio

80b7188 verified 2 months ago

raw

history blame contribute delete

4.26 kB

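	# https://github.com/huggingface/open-r1/blob/main/scripts/decontaminate.py
	"""
	Remove from one dataset every problem that closely matches a problem in another dataset.

	Usage:

	python scripts/data/dedupe_dataset.py \
	--dedupe_dataset <This is the dataset that gets deduped; the RAG index is built over it> \
	--dataset <Reference dataset, unmodified; each of its problems is queried against the index> \
	--problem_column <name of column> \
	[--data_dir <directory the deduped JSON is written to>] \
	[--new_dataset_name <output file name; defaults to "<dedupe_dataset>_dedupe.json">]
	"""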

	import json
	import os

	from tqdm import tqdm

	from rllm.data.dataset_types import TrainDataset
	from rllm.data.utils import load_dataset
	from rllm.utils import RAG


	def normalize_string(text: str) -> str:
	    """Return *text* lowercased with every whitespace run collapsed to one space.

	    Leading and trailing whitespace is removed as part of the collapse.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The normalized string; an empty or whitespace-only input yields "".
	    """
	    # str.split() with no separator splits on any whitespace run and ignores
	    # leading/trailing whitespace, so no separate strip() is required.
	    return " ".join(text.lower().split())


	def get_prompt_from_chat_template(text: "str | list") -> str:
	    """Extract the prompt from a plain string or a chat-template message list.

	    Args:
	        text: Either the prompt itself (returned unchanged) or a list of
	            messages, where the first element is indexed with "content".

	    Returns:
	        The prompt string; "" when *text* is an empty list.

	    Raises:
	        ValueError: If *text* is neither a ``str`` nor a ``list``.
	    """
	    if isinstance(text, str):
	        return text
	    if isinstance(text, list):
	        # Only the first message is consulted; an empty conversation has no prompt.
	        return text[0]["content"] if text else ""
	    raise ValueError(f"Unsupported type for text: {type(text)}. Expected str or list.")


	def main() -> None:
	    """Drop near-duplicate problems from one dataset and write the result as JSON.

	    The problems of ``--dedupe_dataset`` are indexed with ``RAG``. Every string
	    problem of ``--dataset`` is normalized and queried against that index; any
	    indexed row returned with a score above the threshold is treated as a
	    duplicate and removed. The surviving rows are dumped to
	    ``<data_dir>/<new_dataset_name>.json``.

	    Raises:
	        ValueError: If ``--data_dir`` does not exist or is not a directory.
	    """
	    import argparse

	    # Number of nearest neighbours fetched per query, and the score above
	    # which a neighbour counts as a duplicate.
	    top_k = 3
	    score_threshold = 0.95

	    parser = argparse.ArgumentParser()
	    # Both dataset arguments are resolved below as TrainDataset.Code member
	    # names (upper-cased), not as filesystem paths.
	    parser.add_argument("--dedupe_dataset", type=str, required=True, help="Name of the TrainDataset.Code dataset to dedupe (case-insensitive).")
	    parser.add_argument("--dataset", type=str, required=True, help="Name of the TrainDataset.Code dataset to check for duplicates against (case-insensitive).")
	    parser.add_argument("--problem_column", type=str, default="problem", help="Name of the column containing the problem (prompt).")
	    parser.add_argument(
	        "--data_dir",
	        type=str,
	        default=os.path.expanduser("~/rllm/rllm/data/train/code"),
	        help="Data directory to save the deduped dataset. If not provided, will use the default data directory.",
	    )
	    parser.add_argument("--new_dataset_name", type=str, default=None, help="New name for the dataset. If not provided, will reuse the name and add a `_dedupe` to the name.")
	    args = parser.parse_args()

	    # Validate the output location up front: a path that exists but is a
	    # regular file would otherwise only fail at the final open(), after the
	    # whole similarity pass has already run.
	    if not os.path.isdir(args.data_dir):
	        raise ValueError(f"Data directory {args.data_dir} does not exist or is not a directory.")

	    ds_name = TrainDataset.Code[args.dedupe_dataset.upper()]
	    orig_ds_name = TrainDataset.Code[args.dataset.upper()]

	    ds = load_dataset(ds_name)
	    orig_ds = load_dataset(orig_ds_name)

	    # Index the problems of the dataset that will be deduped.
	    # NOTE(review): the indexed documents are raw while the queries below are
	    # normalized (lowercased, whitespace-collapsed) - confirm this asymmetry
	    # is intended for the RAG scorer.
	    problem_col = [prob[args.problem_column] for prob in ds]
	    rag = RAG(docs=problem_col)

	    # Indices into `ds` of rows that match some reference problem.
	    dupe_idx = set()
	    for prob_desc in tqdm(orig_ds, desc="Checking for duplicates"):
	        desc = prob_desc[args.problem_column]
	        if not isinstance(desc, str):
	            print(f"Skipping due to non-string question: {desc}")
	            continue

	        matches = rag.top_k(normalize_string(desc), k=top_k)
	        # `or ()` keeps the original tolerance for an empty/None result.
	        for match in matches or ():
	            if match["score"] > score_threshold:
	                dupe_idx.add(match["idx"])

	    if dupe_idx:
	        print(f"Found {len(dupe_idx)} duplicates in the dataset.")
	        ds = [row for i, row in enumerate(ds) if i not in dupe_idx]
	        print(len(ds), "remaining after removing duplicates.")
	    else:
	        print("No duplicates found.")

	    # Derive the output file name, defaulting to "<dedupe_dataset>_dedupe.json".
	    new_ds_name = args.new_dataset_name or f"{os.path.splitext(os.path.basename(args.dedupe_dataset))[0]}_dedupe"
	    if not new_ds_name.endswith(".json"):
	        new_ds_name += ".json"

	    with open(os.path.join(args.data_dir, new_ds_name), "w", encoding="utf-8") as f:
	        json.dump(ds, f)

	    print(f"All done! Saving the deduped dataset to {new_ds_name} in {args.data_dir}")


	if __name__ == "__main__":
	    main()