# Rabbinic-Embedding-Bench / remove_oversize_entries.py
# Author: Lev Israel — Initial Commit (018c4c5)
"""
One-time script to remove entries exceeding the OpenAI embedding token limit
from the benchmark dataset.
"""
import json
# The refs to remove (from the token limit check report).
# A frozenset: this constant is only ever used for membership tests,
# so O(1) lookup and explicit immutability are both wins.
REFS_TO_REMOVE = frozenset({
    "Shemot Rabbah.1:1",
    "Bamidbar Rabbah.1:2",
    "Bamidbar Rabbah.2:10",
    "Shir HaShirim Rabbah.1.1:10",
    "Eichah Rabbah.1:4",
    "Eichah Rabbah.1:23",
    "Eichah Rabbah.1:31",
    "Ramban on Genesis.18:1",
    "Ramban on Genesis.24:2",
    "Ramban on Leviticus.1.9:1",
    "Ramban on Numbers.16:1",
    "Ramban on Numbers.24:1",
    "Ramban on Deuteronomy.2.23:1",
})
def main(data_path="benchmark_data/benchmark.json", refs_to_remove=None):
    """Remove flagged oversize entries from the benchmark JSON file, in place.

    Loads the JSON file at *data_path* (a list of entry dicts, each with a
    "ref" key), drops every entry whose "ref" is in *refs_to_remove*, and
    writes the filtered list back to the same path.

    Args:
        data_path: Path of the benchmark JSON file to rewrite.
        refs_to_remove: Iterable of "ref" values to drop. Defaults to the
            module-level REFS_TO_REMOVE constant.

    Raises:
        FileNotFoundError: If *data_path* does not exist.
        KeyError: If any entry lacks a "ref" key.
    """
    # Materialize as a set once so the per-entry membership test is O(1).
    refs = set(REFS_TO_REMOVE if refs_to_remove is None else refs_to_remove)
    # Load the data
    print(f"Loading data from: {data_path}")
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    original_count = len(data)
    print(f"Original entry count: {original_count}")
    # Filter out the flagged entries
    filtered_data = [entry for entry in data if entry["ref"] not in refs]
    removed_count = original_count - len(filtered_data)
    print(f"Removed {removed_count} entries")
    print(f"New entry count: {len(filtered_data)}")
    # Save the filtered data (ensure_ascii=False keeps Hebrew text readable)
    print(f"Saving filtered data to: {data_path}")
    with open(data_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
    print("Done!")
# Run only when executed as a script, so the module can be imported
# (e.g. to reuse REFS_TO_REMOVE) without touching the data file.
if __name__ == "__main__":
    main()