"""
Data loader for Rabbinic Hebrew/Aramaic benchmark texts from the Sefaria API.

Fetches parallel Hebrew/Aramaic + English text pairs across diverse categories.
"""

import json
import os
import re
import time
from pathlib import Path
from typing import Optional

import requests
import tiktoken

# Upper bound for a single embedding input; longer segments are skipped.
MAX_EMBEDDING_TOKENS = 8192

# Lazily initialized module-level tokenizer cache (see get_tokenizer).
_tokenizer = None


def get_tokenizer() -> tiktoken.Encoding:
    """Get or create the tiktoken encoder (cached for performance)."""
    global _tokenizer
    if _tokenizer is None:
        _tokenizer = tiktoken.get_encoding("cl100k_base")
    return _tokenizer


def count_tokens(text: str) -> int:
    """Count the number of tokens in a text string using OpenAI's tokenizer."""
    return len(get_tokenizer().encode(text))
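

# A minimal usage sketch for the token budget check, assuming a hypothetical
# `segment` string; exact counts depend on the cl100k_base vocabulary:
#
#     if count_tokens(segment) <= MAX_EMBEDDING_TOKENS:
#         ...  # small enough to embed
#     else:
#         ...  # skip, as extract_parallel_segments does below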


DEFAULT_SEFARIA_HOST = "https://www.sefaria.org"
SEFARIA_HOST = os.environ.get("SEFARIA_HOST", DEFAULT_SEFARIA_HOST)


def set_sefaria_host(host: str) -> None:
    """Set the Sefaria host URL (e.g., 'http://localhost:8000')."""
    global SEFARIA_HOST
    SEFARIA_HOST = host.rstrip("/")


def get_sefaria_host() -> str:
    """Get the current Sefaria host URL."""
    return SEFARIA_HOST
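

# Sketch of pointing the loader at a local Sefaria instance instead of
# production (the localhost URL here is an assumption about your setup):
#
#     set_sefaria_host("http://localhost:8000")
#     assert get_sefaria_host() == "http://localhost:8000"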


BENCHMARK_TEXTS = {
    "talmud_bavli": {
        "category": "Talmud",
        "language": "Aramaic/Hebrew",
        "texts": [
            "Berakhot",
            "Pesachim",
            "Yoma",
            "Megillah",
            "Chagigah",
            "Ketubot",
            "Gittin",
            "Bava Metzia",
            "Sanhedrin",
            "Avodah Zarah",
            "Chullin",
            "Niddah",
        ],
    },
    "talmud_yerushalmi": {
        "category": "Jerusalem Talmud",
        "language": "Aramaic/Hebrew",
        "texts": [
            "Jerusalem Talmud Berakhot",
            "Jerusalem Talmud Kilayim",
            "Jerusalem Talmud Terumot",
            "Jerusalem Talmud Shabbat",
            "Jerusalem Talmud Shekalim",
            "Jerusalem Talmud Sukkah",
            "Jerusalem Talmud Sotah",
            "Jerusalem Talmud Nedarim",
            "Jerusalem Talmud Kiddushin",
            "Jerusalem Talmud Bava Kamma",
            "Jerusalem Talmud Sanhedrin",
            "Jerusalem Talmud Avodah Zarah",
            "Jerusalem Talmud Niddah",
        ],
    },
    "mishnah": {
        "category": "Mishnah",
        "language": "Rabbinic Hebrew",
        "texts": [
            "Mishnah Berakhot",
            "Mishnah Peah",
            "Mishnah Kilayim",
            "Mishnah Shabbat",
            "Mishnah Pesachim",
            "Mishnah Sukkah",
            "Mishnah Taanit",
            "Mishnah Chagigah",
            "Mishnah Yevamot",
            "Mishnah Sotah",
            "Mishnah Kiddushin",
            "Mishnah Bava Kamma",
            "Mishnah Sanhedrin",
            "Mishnah Eduyot",
            "Mishnah Avot",
            "Mishnah Zevachim",
            "Mishnah Chullin",
            "Mishnah Tamid",
            "Mishnah Kelim",
            "Mishnah Parah",
            "Mishnah Niddah",
        ],
    },
    "midrash_rabbah": {
        "category": "Midrash Rabbah",
        "language": "Hebrew/Aramaic",
        "texts": [
            "Bereishit Rabbah",
            "Shemot Rabbah",
            "Vayikra Rabbah",
            "Bamidbar Rabbah",
            "Devarim Rabbah",
            "Shir HaShirim Rabbah",
            "Ruth Rabbah",
            "Eichah Rabbah",
            "Kohelet Rabbah",
            "Esther Rabbah",
        ],
    },
    "tanakh_commentary": {
        "category": "Tanakh Commentary",
        "language": "Hebrew",
        "texts": [
            "Rashi on Genesis",
            "Rashi on Exodus",
            "Rashi on Leviticus",
            "Rashi on Numbers",
            "Rashi on Deuteronomy",
            "Ramban on Genesis",
            "Ramban on Exodus",
            "Ramban on Leviticus",
            "Ramban on Numbers",
            "Ramban on Deuteronomy",
            "Radak on Genesis",
            "Akeidat Yitzchak",
            "Rabbeinu Behaye, Bereshit",
            "Rabbeinu Behaye, Shemot",
            "Rabbeinu Behaye, Vayikra",
            "Rabbeinu Behaye, Bamidbar",
            "Rabbeinu Behaye, Devarim",
        ],
    },
    "hasidic_kabbalistic": {
        "category": "Hasidic/Kabbalistic",
        "language": "Hebrew",
        "texts": [
            "Likutei Moharan",
            "Tomer Devorah",
            "Or Neerav, PART I",
            "Or Neerav, PART II",
            "Or Neerav, PART III",
            "Shekel HaKodesh, On Abstinence",
            "Shekel HaKodesh, On Wisdom",
            "Kalach Pitchei Chokhmah",
        ],
    },
    "halacha": {
        "category": "Halacha",
        "language": "Hebrew",
        "texts": [
            "Sefer HaChinukh",
            "Shev Shmateta, Introduction",
            "Mishneh Torah, Human Dispositions",
            "Sefer Yesodei HaTorah",
        ],
    },
    "philosophy": {
        "category": "Philosophy",
        "language": "Hebrew",
        "texts": [
            "Sefer HaIkkarim, Maamar 1",
            "Sefer HaIkkarim, Maamar 2",
            "Sefer HaIkkarim, Maamar 3",
            "Guide for the Perplexed, Part 1",
            "Guide for the Perplexed, Part 2",
            "Guide for the Perplexed, Part 3",
        ],
    },
    "targum": {
        "category": "Targum",
        "language": "Aramaic",
        "texts": [
            "Aramaic Targum to Song of Songs",
        ],
    },
    "mussar": {
        "category": "Mussar/Ethics",
        "language": "Hebrew",
        "texts": [
            "Iggeret HaRamban",
            "Shulchan Shel Arba",
            "Chafetz Chaim",
            "Yesod HaYirah, On Endurance",
            "Yesod HaYirah, On Humility",
            "Kav HaYashar",
        ],
    },
}
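

# Quick sketch of walking the config, e.g., to preview what would be fetched:
#
#     for key, info in BENCHMARK_TEXTS.items():
#         print(f"{info['category']} ({info['language']}): {len(info['texts'])} texts")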


def strip_html(text: str) -> str:
    """
    Remove HTML tags from text.

    Some tags are dropped completely with their content:
    - <sup class="footnote-marker">...</sup>
    - <i class="footnote"...>...</i>

    Other tags are stripped but their inner content is preserved.
    """
    # Drop footnote markers together with their content.
    clean = re.sub(r'<sup[^>]*class="footnote-marker"[^>]*>.*?</sup>', '', text, flags=re.DOTALL)

    # Drop footnote bodies, which may contain nested <i> tags.
    clean = _remove_footnote_tags(clean)

    # Strip any remaining tags, keeping their inner content.
    clean = re.sub(r"<[^>]+>", "", clean)

    # Collapse whitespace left behind by removed tags.
    clean = re.sub(r"\s+", " ", clean).strip()
    return clean
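

# A worked example of the stripping rules above: footnote content is dropped,
# other tags are unwrapped, and whitespace is collapsed.
#
#     strip_html('<b>Hi</b> <i class="footnote">a note</i> there')
#     # -> 'Hi there'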


def _remove_footnote_tags(text: str) -> str:
    """Remove <i class="footnote"...>...</i> tags, handling nested <i> tags."""
    result = []
    i = 0
    while i < len(text):
        # Does a footnote <i> tag open at the current position?
        match = re.match(r'<i[^>]*class="footnote"[^>]*>', text[i:], flags=re.IGNORECASE)
        if match:
            # Scan forward for the matching </i>, tracking nesting depth.
            start = i + match.end()
            depth = 1
            j = start
            while j < len(text) and depth > 0:
                if text[j:j+3].lower() == '<i ' or text[j:j+3].lower() == '<i>':
                    depth += 1
                    j += 1
                elif text[j:j+4].lower() == '</i>':
                    depth -= 1
                    if depth == 0:
                        # Skip past the closing </i> as well.
                        j += 4
                        break
                    j += 1
                else:
                    j += 1
            # Resume copying after the footnote (its content is dropped).
            i = j
        else:
            result.append(text[i])
            i += 1
    return ''.join(result)
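

# The nesting matters: a naive regex would stop at the first </i>. For example:
#
#     _remove_footnote_tags('x<i class="footnote">a <i>b</i> c</i>y')
#     # -> 'xy'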


def extract_bold_only(text: str) -> str:
    """
    Extract only content within <b>...</b> tags, for Talmud Bavli.

    The Steinsaltz English has bold for the actual translation and non-bold
    for elucidation. We only want the translation.

    Example:
        "<b>The Rabbis say:</b> The time for... is <b>until midnight.</b>"
        -> "The Rabbis say: until midnight."
    """
    bold_parts = re.findall(r'<b>(.*?)</b>', text, flags=re.DOTALL)

    # No bold markup at all: fall back to the full stripped text.
    if not bold_parts:
        return strip_html(text)

    cleaned_parts = [strip_html(part) for part in bold_parts]
    result = ' '.join(cleaned_parts)
    result = re.sub(r"\s+", " ", result).strip()
    return result


def get_text_from_sefaria(ref: str, retries: int = 3) -> Optional[dict]:
    """
    Fetch a text from the Sefaria API.

    Args:
        ref: Sefaria reference string (e.g., "Berakhot.2a")
        retries: Number of retry attempts

    Returns:
        Dict with 'he' (Hebrew/Aramaic) and 'en' (English) texts, or None if failed/error
    """
    url = f"{SEFARIA_HOST}/api/texts/{ref}"
    params = {"context": 0}

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=30)
            if response.status_code == 200:
                data = response.json()
                # The API returns 200 with an "error" key for bad refs.
                if "error" in data:
                    return None
                return data
            elif response.status_code == 429:
                # Rate limited: back off exponentially before retrying.
                time.sleep(2 ** attempt)
            else:
                return None
        except requests.RequestException:
            if attempt < retries - 1:
                time.sleep(1)
                continue
    return None
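

# Illustrative response shape (abridged) for a ref like "Berakhot.2a"; this
# loader only relies on the "he" and "text" keys consumed below:
#
#     {
#         "he": ["<b>מאימתי</b> קורין את שמע...", ...],
#         "text": ["<b>From when</b> does one recite...", ...],
#         ...
#     }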


def get_index_from_sefaria(title: str) -> Optional[dict]:
    """
    Get index/structure information for a text.

    Args:
        title: The title of the text

    Returns:
        Index data, or None if the request failed or the text was not found
    """
    url = f"{SEFARIA_HOST}/api/index/{title}"
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            data = response.json()
            # As with the texts endpoint, errors come back in a 200 payload.
            if "error" in data:
                return None
            return data
    except requests.RequestException:
        pass
    return None


def extract_parallel_segments(data: dict, ref: str, category: str = "") -> list[dict]:
    """
    Extract parallel Hebrew/English segments from an API response.

    Args:
        data: API response data
        ref: The reference string
        category: Category name (used for special handling, e.g., "Talmud")

    Returns:
        List of dicts with 'ref', 'he', 'en' keys
    """
    segments = []

    he_text = data.get("he", [])
    en_text = data.get("text", [])

    # Flatten one level of nesting (some responses are lists of lists).
    if he_text and isinstance(he_text, list):
        if he_text and isinstance(he_text[0], list):
            he_flat = []
            en_flat = []
            for he_seg, en_seg in zip(he_text, en_text):
                if isinstance(he_seg, list):
                    he_flat.extend(he_seg)
                    en_flat.extend(en_seg if isinstance(en_seg, list) else [en_seg])
                else:
                    he_flat.append(he_seg)
                    en_flat.append(en_seg)
            he_text = he_flat
            en_text = en_flat

    # Single-segment responses come back as bare strings.
    if isinstance(he_text, str):
        he_text = [he_text]
    if isinstance(en_text, str):
        en_text = [en_text]

    # Talmud Bavli English (Steinsaltz) needs bold-only extraction.
    is_bavli = category == "Talmud"

    for i, (he, en) in enumerate(zip(he_text, en_text)):
        if he and en:
            he_clean = strip_html(str(he)) if he else ""
            if is_bavli:
                en_clean = extract_bold_only(str(en)) if en else ""
            else:
                en_clean = strip_html(str(en)) if en else ""

            # Keep only substantive pairs.
            if len(he_clean) > 10 and len(en_clean) > 10:
                # Skip segments too long to embed.
                he_tokens = count_tokens(he_clean)
                en_tokens = count_tokens(en_clean)
                if he_tokens > MAX_EMBEDDING_TOKENS:
                    print(f" Skipping {ref}:{i+1} - Hebrew text exceeds token limit ({he_tokens} > {MAX_EMBEDDING_TOKENS})")
                    continue
                if en_tokens > MAX_EMBEDDING_TOKENS:
                    print(f" Skipping {ref}:{i+1} - English text exceeds token limit ({en_tokens} > {MAX_EMBEDDING_TOKENS})")
                    continue

                segments.append({
                    "ref": f"{ref}:{i+1}" if ":" not in ref else ref,
                    "he": he_clean,
                    "en": en_clean,
                })

    return segments
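

# Sketch with synthetic data (both strings long enough to pass the >10-char
# filter); each kept pair carries a segment-numbered ref:
#
#     data = {"he": ["<b>טקסט עברי ארוך למדי</b>"], "text": ["A long enough English segment"]}
#     extract_parallel_segments(data, "Some Ref.1")
#     # -> [{"ref": "Some Ref.1:1", "he": "טקסט עברי ארוך למדי",
#     #      "en": "A long enough English segment"}]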


def fetch_text_pairs(
    text_title: str,
    category: str,
    max_segments: int = 500,
    delay: float = 0.5
) -> list[dict]:
    """
    Fetch parallel text pairs for a given text.

    Args:
        text_title: Title of the text to fetch
        category: Category name for metadata
        max_segments: Maximum segments to fetch per text
        delay: Delay between API calls (rate limiting)

    Returns:
        List of segment dicts with ref, he, en, category
    """
    pairs = []

    # The index tells us the text's structure (depth, address types).
    index = get_index_from_sefaria(text_title)
    if not index:
        print(f" Could not get index for {text_title}")
        return pairs

    schema = index.get("schema", {})

    if schema.get("nodeType") == "JaggedArrayNode":
        depth = schema.get("depth", 2)
        address_types = schema.get("addressTypes", [])

        # Talmud-style texts are addressed by daf (folio) and side (a/b).
        uses_talmud_daf = address_types and address_types[0] == "Talmud"

        if uses_talmud_daf:
            # Bavli tractates start at daf 2; Yerushalmi numbering differs.
            start_daf = 3 if category == "Jerusalem Talmud" else 2

            done = False
            for daf_num in range(start_daf, 200):
                if len(pairs) >= max_segments or done:
                    break

                for side in ["a", "b"]:
                    if len(pairs) >= max_segments:
                        break

                    ref = f"{text_title}.{daf_num}{side}"
                    data = get_text_from_sefaria(ref)

                    # A missing "a" side means we've run past the end.
                    if data is None:
                        if side == "a":
                            done = True
                        break

                    if not data.get("he"):
                        continue

                    segments = extract_parallel_segments(data, ref, category)
                    for seg in segments:
                        seg["category"] = category
                    pairs.extend(segments)

                    time.sleep(delay)

        elif depth == 1:
            # Flat texts: fetch everything in one call.
            data = get_text_from_sefaria(text_title)
            if data and data.get("he"):
                segments = extract_parallel_segments(data, text_title, category)
                for seg in segments:
                    seg["category"] = category
                pairs.extend(segments)

        elif depth == 2:
            # Chapter-addressed texts: fetch chapter by chapter.
            start_chapter = 2 if category == "Mishnah" else 1
            consecutive_empty = 0

            for chapter in range(start_chapter, 200):
                if len(pairs) >= max_segments:
                    break

                ref = f"{text_title}.{chapter}"
                data = get_text_from_sefaria(ref)

                if data is None:
                    break

                if not data.get("he"):
                    consecutive_empty += 1
                    if consecutive_empty >= 5:
                        break
                    time.sleep(delay)
                    continue

                consecutive_empty = 0
                segments = extract_parallel_segments(data, ref, category)
                for seg in segments:
                    seg["category"] = category
                pairs.extend(segments)

                time.sleep(delay)

        elif depth >= 3:
            # Chapter.verse-addressed texts: walk chapters, then verses.
            start_verse = 3 if category == "Jerusalem Talmud" else 1
            consecutive_empty_chapters = 0
            for chapter in range(1, 200):
                if len(pairs) >= max_segments:
                    break

                chapter_had_content = False

                first_verse = start_verse if chapter == 1 else 1
                for verse in range(first_verse, 100):
                    if len(pairs) >= max_segments:
                        break

                    ref = f"{text_title}.{chapter}.{verse}"
                    data = get_text_from_sefaria(ref)

                    # Past the last verse of this chapter.
                    if data is None:
                        break

                    if not data.get("he"):
                        continue

                    chapter_had_content = True
                    segments = extract_parallel_segments(data, ref, category)
                    for seg in segments:
                        seg["category"] = category
                    pairs.extend(segments)

                    time.sleep(delay)

                if not chapter_had_content:
                    consecutive_empty_chapters += 1
                    if consecutive_empty_chapters >= 5:
                        break
                else:
                    consecutive_empty_chapters = 0

    else:
        # Complex schema: fall back to probing flat section refs first.
        consecutive_empty = 0
        for section in range(1, 1000):
            if len(pairs) >= max_segments:
                break

            ref = f"{text_title}.{section}"
            data = get_text_from_sefaria(ref)

            if data is None:
                break

            if not data.get("he"):
                consecutive_empty += 1
                if consecutive_empty >= 5:
                    break
                time.sleep(delay)
                continue

            consecutive_empty = 0
            segments = extract_parallel_segments(data, ref, category)
            for seg in segments:
                seg["category"] = category
            pairs.extend(segments)

            time.sleep(delay)

        # If flat refs yielded little, try chapter.verse refs as well.
        if len(pairs) < max_segments:
            consecutive_empty = 0
            for chapter in range(1, 100):
                if len(pairs) >= max_segments:
                    break

                chapter_had_content = False
                for verse in range(1, 50):
                    if len(pairs) >= max_segments:
                        break

                    ref = f"{text_title}.{chapter}.{verse}"
                    data = get_text_from_sefaria(ref)

                    if data is None:
                        break

                    if data.get("he"):
                        chapter_had_content = True
                        consecutive_empty = 0
                        segments = extract_parallel_segments(data, ref, category)
                        for seg in segments:
                            seg["category"] = category
                        pairs.extend(segments)

                    time.sleep(delay)

                if not chapter_had_content:
                    consecutive_empty += 1
                    if consecutive_empty >= 5:
                        break

    return pairs[:max_segments]
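

# Example call, assuming the configured Sefaria host is reachable (network
# access and response shapes are the external dependencies here):
#
#     pairs = fetch_text_pairs("Mishnah Berakhot", "Mishnah", max_segments=50)
#     # each item: {"ref": ..., "he": ..., "en": ..., "category": "Mishnah"}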


def build_benchmark_dataset(
    output_path: str = "benchmark_data/benchmark.json",
    segments_per_text: int = 200,
    total_target: int = 10000,
) -> list[dict]:
    """
    Build the full benchmark dataset from all configured texts.

    Args:
        output_path: Path to save the benchmark JSON
        segments_per_text: Target segments per text
        total_target: Overall target segment count

    Returns:
        List of all benchmark pairs
    """
    all_pairs = []

    for category_key, category_info in BENCHMARK_TEXTS.items():
        category_name = category_info["category"]
        texts = category_info["texts"]

        print(f"\n{'='*60}")
        print(f"Processing category: {category_name}")
        print(f"{'='*60}")

        for text_title in texts:
            if len(all_pairs) >= total_target:
                break

            print(f"\nFetching: {text_title}")

            pairs = fetch_text_pairs(
                text_title,
                category_name,
                max_segments=segments_per_text,
            )

            print(f" Got {len(pairs)} pairs")
            all_pairs.extend(pairs)

        if len(all_pairs) >= total_target:
            break

    # Write the collected pairs to disk.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_pairs, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Total pairs collected: {len(all_pairs)}")
    print(f"Saved to: {output_path}")

    # Save a human-readable stats summary alongside the JSON.
    stats = get_benchmark_stats(all_pairs)
    save_stats_markdown(stats, output_path)

    return all_pairs


def load_benchmark_dataset(
    source: str = "Sefaria/Rabbinic-Hebrew-English-Pairs",
    use_local: bool = False,
) -> list[dict]:
    """
    Load the benchmark dataset from HuggingFace Hub or a local file.

    Args:
        source: HuggingFace dataset ID or local file path
        use_local: If True, load from local JSON file instead of HuggingFace

    Returns:
        List of benchmark pairs with keys: ref, he, en, category
    """
    if use_local or source.endswith(".json"):
        # Treat the source as a path to a local JSON file.
        with open(source, "r", encoding="utf-8") as f:
            return json.load(f)

    # Otherwise, try HuggingFace Hub first.
    try:
        from datasets import load_dataset

        print(f"Loading benchmark from HuggingFace: {source}")
        ds = load_dataset(source, split="train")
        return ds.to_list()
    except Exception as e:
        print(f"Failed to load from HuggingFace: {e}")

        # Fall back to the default local file if it exists.
        local_path = "benchmark_data/benchmark.json"
        if Path(local_path).exists():
            print(f"Falling back to local file: {local_path}")
            with open(local_path, "r", encoding="utf-8") as f:
                return json.load(f)
        raise
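

# Two ways to load, sketched; the HuggingFace path requires the optional
# `datasets` dependency, the local path only needs the JSON file to exist:
#
#     pairs = load_benchmark_dataset()  # from HF Hub (module default ID)
#     pairs = load_benchmark_dataset("benchmark_data/benchmark.json")  # local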


def get_benchmark_stats(pairs: list[dict]) -> dict:
    """
    Get statistics about the benchmark dataset.

    Args:
        pairs: List of benchmark pairs

    Returns:
        Dict with category counts and other stats
    """
    from collections import Counter

    categories = Counter(p["category"] for p in pairs)

    he_lengths = [len(p["he"]) for p in pairs]
    en_lengths = [len(p["en"]) for p in pairs]

    return {
        "total_pairs": len(pairs),
        "categories": dict(categories),
        "avg_he_length": sum(he_lengths) / len(he_lengths) if he_lengths else 0,
        "avg_en_length": sum(en_lengths) / len(en_lengths) if en_lengths else 0,
    }


def save_stats_markdown(stats: dict, data_path: str) -> str:
    """
    Save benchmark statistics to a markdown file alongside the data.

    Args:
        stats: Statistics dict from get_benchmark_stats()
        data_path: Path to the data file (used to derive stats file path)

    Returns:
        Path to the saved markdown file
    """
    from datetime import datetime

    # Derive e.g. benchmark_data/benchmark.stats.md from the data path.
    data_file = Path(data_path)
    stats_path = data_file.with_suffix(".stats.md")

    lines = [
        "# Benchmark Dataset Statistics",
        "",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## Summary",
        "",
        f"- **Total pairs:** {stats['total_pairs']:,}",
        f"- **Average Hebrew length:** {stats['avg_he_length']:.0f} chars",
        f"- **Average English length:** {stats['avg_en_length']:.0f} chars",
        "",
        "## Category Breakdown",
        "",
        "| Category | Count |",
        "|----------|-------|",
    ]

    # List categories from largest to smallest.
    sorted_categories = sorted(
        stats["categories"].items(),
        key=lambda x: x[1],
        reverse=True,
    )

    for category, count in sorted_categories:
        lines.append(f"| {category} | {count:,} |")

    lines.append("")

    with open(stats_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"Stats saved to: {stats_path}")
    return str(stats_path)


if __name__ == "__main__":
    print("Building Rabbinic Hebrew/Aramaic benchmark dataset...")
    pairs = build_benchmark_dataset()

    stats = get_benchmark_stats(pairs)
    print("\nDataset Statistics:")
    print(f" Total pairs: {stats['total_pairs']}")
    print(f" Categories: {stats['categories']}")
    print(f" Avg Hebrew length: {stats['avg_he_length']:.0f} chars")
    print(f" Avg English length: {stats['avg_en_length']:.0f} chars")