"""
One-time script to remove entries exceeding the OpenAI embedding token limit
from the benchmark dataset.
"""
|
|
| import json |
|
|
| |
# Refs whose text exceeds the OpenAI embedding model's token limit; entries
# with these "ref" values are stripped from the benchmark dataset by main().
REFS_TO_REMOVE = [
    "Shemot Rabbah.1:1",
    "Bamidbar Rabbah.1:2",
    "Bamidbar Rabbah.2:10",
    "Shir HaShirim Rabbah.1.1:10",
    "Eichah Rabbah.1:4",
    "Eichah Rabbah.1:23",
    "Eichah Rabbah.1:31",
    "Ramban on Genesis.18:1",
    "Ramban on Genesis.24:2",
    "Ramban on Leviticus.1.9:1",
    "Ramban on Numbers.16:1",
    "Ramban on Numbers.24:1",
    "Ramban on Deuteronomy.2.23:1",
]
|
|
def main(data_path="benchmark_data/benchmark.json", refs_to_remove=None):
    """Filter oversized entries out of the benchmark dataset file, in place.

    Loads the JSON list of benchmark entries from ``data_path``, drops every
    entry whose ``"ref"`` field appears in the removal list, and writes the
    filtered list back to the same file.

    Args:
        data_path: Path of the benchmark JSON file to rewrite.
            Defaults to the original hard-coded location.
        refs_to_remove: Iterable of ref strings to drop. Defaults to the
            module-level ``REFS_TO_REMOVE`` list.
    """
    # Build a set once so each membership test during filtering is O(1)
    # instead of scanning the removal list per entry.
    refs = set(REFS_TO_REMOVE if refs_to_remove is None else refs_to_remove)

    print(f"Loading data from: {data_path}")
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    original_count = len(data)
    print(f"Original entry count: {original_count}")

    filtered_data = [entry for entry in data if entry["ref"] not in refs]

    removed_count = original_count - len(filtered_data)
    print(f"Removed {removed_count} entries")
    print(f"New entry count: {len(filtered_data)}")

    print(f"Saving filtered data to: {data_path}")
    with open(data_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)

    print("Done!")
|
|
# Run the one-time cleanup only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|