# Rabbinic-Embedding-Bench / remove_oversize_entries.py
# Author: Lev Israel — Initial Commit (018c4c5)
"""
One-time script to remove entries exceeding the OpenAI embedding token limit
from the benchmark dataset.
"""
import json
# The refs to remove (from the token limit check report).
# A frozenset: this constant is only ever used for membership tests,
# so O(1) lookup and explicit immutability are both wins.
REFS_TO_REMOVE = frozenset({
    "Shemot Rabbah.1:1",
    "Bamidbar Rabbah.1:2",
    "Bamidbar Rabbah.2:10",
    "Shir HaShirim Rabbah.1.1:10",
    "Eichah Rabbah.1:4",
    "Eichah Rabbah.1:23",
    "Eichah Rabbah.1:31",
    "Ramban on Genesis.18:1",
    "Ramban on Genesis.24:2",
    "Ramban on Leviticus.1.9:1",
    "Ramban on Numbers.16:1",
    "Ramban on Numbers.24:1",
    "Ramban on Deuteronomy.2.23:1",
})
def main(data_path="benchmark_data/benchmark.json", refs_to_remove=None):
    """Remove flagged oversize entries from the benchmark JSON file, in place.

    Loads the JSON file at *data_path* (a list of entry dicts, each with a
    "ref" key), drops every entry whose "ref" is in *refs_to_remove*, and
    writes the filtered list back to the same path.

    Args:
        data_path: Path of the benchmark JSON file to rewrite.
        refs_to_remove: Iterable of "ref" values to drop. Defaults to the
            module-level REFS_TO_REMOVE constant.

    Raises:
        FileNotFoundError: If *data_path* does not exist.
        KeyError: If any entry lacks a "ref" key.
    """
    # Materialize as a set once so the per-entry membership test is O(1).
    refs = set(REFS_TO_REMOVE if refs_to_remove is None else refs_to_remove)
    # Load the data
    print(f"Loading data from: {data_path}")
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    original_count = len(data)
    print(f"Original entry count: {original_count}")
    # Filter out the flagged entries
    filtered_data = [entry for entry in data if entry["ref"] not in refs]
    removed_count = original_count - len(filtered_data)
    print(f"Removed {removed_count} entries")
    print(f"New entry count: {len(filtered_data)}")
    # Save the filtered data (ensure_ascii=False keeps Hebrew text readable)
    print(f"Saving filtered data to: {data_path}")
    with open(data_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
    print("Done!")
# Run only when executed as a script, so the module can be imported
# (e.g. to reuse REFS_TO_REMOVE) without touching the data file.
if __name__ == "__main__":
    main()