# aramt5/src/data/fetch_sedra_vocalised.py
# crossroderick (name shown in the hosting page header)
# Commit message: Data augmentation and balancing updates for a re-run of v3
# Commit: 11632a3
"""
Fetch vocalised Syriac word forms from SEDRA database.
SEDRA (Syriac Electronic Data Retrieval Archive) provides:
- Unpointed Syriac forms
- West Syriac (Serto) vocalised forms
- East Syriac (Madnḥaya) vocalised forms
- Grammatical info (gender, number, tense, etc.)
- English glosses
Source: https://sedra.bethmardutho.org/
"""
import asyncio
import json
from pathlib import Path
import aiohttp
from tqdm import tqdm
# Root of the SEDRA HTTP API; fetch_word() requests "{BASE_URL}/word/{id}".
BASE_URL = "https://sedra.bethmardutho.org/api"
# Final dataset written by main(): one JSON record per line (JSONL).
OUTPUT_FILE = Path(__file__).parent / "sedra_vocalised.jsonl"
# JSON checkpoint ({"next_id": ..., "words": [...]}) used to resume a fetch.
PROGRESS_FILE = Path(__file__).parent / ".sedra_progress"
# Approximate upper bound of word IDs (found by probing)
MAX_WORD_ID = 65000
# Number of word IDs requested concurrently per batch; also used as the
# connection limit of the aiohttp connector in main(). Keep low to avoid
# overwhelming the server.
BATCH_SIZE = 20
# Seconds main() sleeps after each batch.
DELAY_BETWEEN_BATCHES = 2.0
# Maximum number of retries fetch_word() performs per word after an HTTP 429
# response or a timeout.
MAX_RETRIES = 3
# main() writes a checkpoint roughly every SAVE_EVERY word IDs.
SAVE_EVERY = 500
async def fetch_word(
    session: aiohttp.ClientSession, word_id: int, retries: int = 0
) -> dict | None:
    """Fetch a single word from the SEDRA API, retrying transient failures.

    HTTP 429 responses are retried with exponential backoff (1s, 2s, 4s, ...)
    and timeouts are retried after a fixed one-second pause, both sharing a
    budget of ``MAX_RETRIES`` retries.

    Args:
        session: Open aiohttp session used to issue the request.
        word_id: SEDRA word ID, requested as ``{BASE_URL}/word/{word_id}``.
        retries: Number of retries already consumed; callers normally leave
            this at its default of 0.

    Returns:
        The first element of the JSON list returned by the API, or ``None``
        when the response is not a non-empty list, the status is neither 200
        nor a retryable 429, the retry budget is exhausted, or the request
        fails for any other reason.
    """
    url = f"{BASE_URL}/word/{word_id}"
    timeout = aiohttp.ClientTimeout(total=15)
    attempt = retries

    while True:
        try:
            async with session.get(url, timeout=timeout) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # Only a non-empty JSON list carries a usable entry.
                    if isinstance(data, list) and data:
                        return data[0]
                    return None
                if resp.status != 429 or attempt >= MAX_RETRIES:
                    return None
                delay = 2**attempt  # Rate limited: exponential backoff
        except asyncio.TimeoutError:
            if attempt >= MAX_RETRIES:
                return None
            delay = 1
        except Exception:
            # Deliberate best-effort: one bad ID (connection error, malformed
            # JSON, ...) must not abort the whole multi-hour crawl.
            return None

        # The response context has been exited at this point, so the
        # connection is not held while we wait and issue the retry.
        await asyncio.sleep(delay)
        attempt += 1
async def fetch_batch(
    session: aiohttp.ClientSession, start_id: int, batch_size: int
) -> list[dict]:
    """Request ``batch_size`` consecutive word IDs at once.

    IDs ``start_id`` .. ``start_id + batch_size - 1`` are fetched concurrently
    via ``fetch_word``; IDs for which it returned ``None`` are left out of the
    result, the rest keep their ID order.
    """
    word_ids = range(start_id, start_id + batch_size)
    pending = [fetch_word(session, word_id) for word_id in word_ids]
    fetched = await asyncio.gather(*pending)

    found = []
    for entry in fetched:
        if entry is None:
            continue
        found.append(entry)
    return found
def check_existing_data() -> int:
"""Check if data file exists and count entries. Returns count or 0."""
if not OUTPUT_FILE.exists():
return 0
try:
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
return sum(1 for _ in f)
except Exception:
return 0
def load_progress() -> tuple[int, list[dict]]:
"""Load progress from checkpoint file. Returns (start_id, accumulated_words)."""
if not PROGRESS_FILE.exists():
return 1, []
try:
with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
data = json.load(f)
return data.get("next_id", 1), data.get("words", [])
except Exception:
return 1, []
def save_progress(next_id: int, words: list[dict]):
"""Save progress to checkpoint file."""
with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
json.dump({"next_id": next_id, "words": words}, f)
def clear_progress() -> None:
    """Delete the checkpoint file (``PROGRESS_FILE``); do nothing if it is absent."""
    # missing_ok avoids the exists()/unlink() race of a check-then-delete.
    PROGRESS_FILE.unlink(missing_ok=True)
def word_to_record(word: dict) -> dict:
    """Project a SEDRA word entry onto the fields stored in our dataset.

    Missing text fields default to an empty string and missing ``glosses``
    to an empty dict; any other keys of ``word`` are dropped.
    """
    text_fields = (
        "syriac",
        "western",
        "eastern",
        "category",
        "gender",
        "number",
        "tense",
        "person",
    )
    record = {field: word.get(field, "") for field in text_fields}
    record["glosses"] = word.get("glosses", {})
    return record
async def main(force: bool = False, resume: bool = True):
    """
    Fetch all vocalised words from SEDRA and write them to ``OUTPUT_FILE``.

    Word IDs 1..``MAX_WORD_ID`` are requested in batches of ``BATCH_SIZE``
    with a pause of ``DELAY_BETWEEN_BATCHES`` seconds after each batch. A
    checkpoint is written periodically and on interruption so a later run
    can resume.

    Args:
        force: If True, re-fetch even if data exists
        resume: If True, resume from last checkpoint
    """
    # A finished run writes OUTPUT_FILE and removes the checkpoint, so
    # "output present, no checkpoint" is what a complete download looks like.
    # (The record count cannot be compared with MAX_WORD_ID: IDs without an
    # entry are dropped, so the two numbers do not match.)
    existing_count = check_existing_data()
    if existing_count > 0 and not PROGRESS_FILE.exists() and not force:
        print(f"SEDRA data already exists with {existing_count} entries.")
        print("Use --force to re-download.")
        return

    # Check for resume
    start_id, all_words = (1, []) if force or not resume else load_progress()
    if start_id > 1:
        print(
            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
        )
    else:
        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")
    print("Progress is saved periodically. Press Ctrl+C to pause safely.")

    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
    # First ID whose result is NOT yet in all_words. Tracked separately from
    # the loop variable so that an interrupt arriving after a batch was merged
    # (e.g. during the sleep) does not make the next run fetch it again.
    next_id = start_id
    try:
        async with aiohttp.ClientSession(connector=connector) as session:
            # The context manager closes the bar on interruption too.
            with tqdm(
                total=MAX_WORD_ID,
                initial=start_id - 1,
                desc="Fetching SEDRA",
                unit="words",
            ) as pbar:
                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
                    # Never request IDs beyond MAX_WORD_ID in the last batch.
                    size = min(BATCH_SIZE, MAX_WORD_ID - batch_start + 1)
                    batch = await fetch_batch(session, batch_start, size)
                    all_words.extend(word_to_record(w) for w in batch)
                    next_id = batch_start + size
                    pbar.update(size)
                    pbar.set_postfix({"found": len(all_words)})
                    # Save progress periodically
                    if batch_start % SAVE_EVERY < BATCH_SIZE:
                        save_progress(next_id, all_words)
                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)
    except (KeyboardInterrupt, asyncio.CancelledError):
        save_progress(next_id, all_words)
        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
        print("Run again to resume (or use --force to restart).")
        return

    # Save final results
    print(f"\nFetched {len(all_words)} vocalised words")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for record in all_words:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    # Only drop the checkpoint once the output has been written completely.
    clear_progress()
    print(f"Saved to {OUTPUT_FILE}")
def west_to_east_vowels(western_pointed: str) -> str:
    """
    Return the West Syriac pointed text to use as a stand-in East Syriac form.

    The text is returned unchanged. According to the notes that accompany
    this helper, the zqapha marks (ܳ/ܴ/ܵ) are read as o in West Syriac and as
    ā in East Syriac, so the difference is expected to come from
    transliterating the same pointed text with East rules rather than from
    rewriting any characters here.

    Intended as a heuristic fallback when no Eastern form is available.
    """
    # Intentionally an identity mapping: consonants and vowel marks are kept.
    east_pointed = western_pointed
    return east_pointed
def generate_training_pairs():
    """
    Generate training pairs from SEDRA vocalised data.

    Reads ``OUTPUT_FILE`` and writes two JSONL files next to this script,
    ``sedra_west_pairs.jsonl`` and ``sedra_east_pairs.jsonl``. Each line maps
    an unpointed Syriac form to the Latin transliteration of its vocalised
    form. Transliterations shorter than two characters are dropped.

    Fallback strategy:
    - If the Eastern form is missing, the Western form is transliterated with
      East rules (intended to read zqapha as ā instead of o) and the pair is
      tagged with source "sedra-west2east"
    - Entries that have neither Western nor Eastern forms are skipped
    """
    from generate_syr_lat_pairs import (SyriacDialect,
                                        transliterate_word_pointed)

    input_file = OUTPUT_FILE
    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"
    if not input_file.exists():
        print(f"Run the fetch first: python {__file__}")
        return

    west_pairs = []
    east_pairs = []
    skipped_no_vocalised = 0
    fallback_count = 0  # East pairs built from the Western pointed form

    with open(input_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Generating transliteration pairs"):
            if not line.strip():
                continue  # tolerate blank lines instead of failing to parse
            record = json.loads(line)
            unpointed = record.get("syriac", "")
            western = record.get("western", "")
            eastern = record.get("eastern", "")
            category = record.get("category", "")
            if not unpointed:
                continue
            # Skip entries that have neither Western nor Eastern vocalised forms
            if not western and not eastern:
                skipped_no_vocalised += 1
                continue

            # West Syriac pair (only when a vocalised western form exists)
            if western:
                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
                if west_latin and len(west_latin) > 1:
                    west_pairs.append(
                        _make_sedra_pair(
                            unpointed, west_latin, western, "west", "sedra", category
                        )
                    )

            # East Syriac pair: prefer the Eastern form; otherwise fall back to
            # the Western pointed form transliterated with East rules. At least
            # one of the two is non-empty at this point.
            used_fallback = not eastern
            east_pointed = western if used_fallback else eastern
            source_note = "sedra-west2east" if used_fallback else "sedra"
            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
            if east_latin and len(east_latin) > 1:
                east_pairs.append(
                    _make_sedra_pair(
                        unpointed, east_latin, east_pointed, "east", source_note, category
                    )
                )
                if used_fallback:
                    fallback_count += 1

    # Save pairs
    _write_jsonl(west_output, west_pairs)
    _write_jsonl(east_output, east_pairs)

    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
    print(f" (of which {fallback_count} used West→East fallback with o→ā conversion)")
    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")


def _make_sedra_pair(
    unpointed: str,
    latin: str,
    pointed: str,
    dialect: str,
    source: str,
    category: str,
) -> dict:
    """Build one training record in the ``{"transliteration": {...}}`` layout."""
    return {
        "transliteration": {
            "src": unpointed,  # Input: unpointed
            "tgt": latin,  # Output: vocalised Latin
            "src_pointed": pointed,  # Reference
            "dialect": dialect,
            "source": source,
            "category": category,
        }
    }


def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Write ``records`` to ``path``, one JSON object per line (UTF-8)."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    # Minimal hand-rolled CLI: --pairs wins over --help/-h, anything else
    # (including no arguments) starts the fetch.
    import sys

    cli_args = sys.argv[1:]
    wants_help = "--help" in cli_args or "-h" in cli_args

    if "--pairs" in cli_args:
        generate_training_pairs()
    elif wants_help:
        help_lines = (
            "Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]",
            "",
            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database.",
            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours.",
            "Progress is saved automatically - you can resume if interrupted.",
            "",
            "Options:",
            " --force Re-fetch data even if it already exists",
            " --pairs Generate training pairs from existing SEDRA data",
            " --help Show this help message",
            "",
            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &",
        )
        for help_line in help_lines:
            print(help_line)
    else:
        try:
            asyncio.run(main(force="--force" in cli_args))
        except KeyboardInterrupt:
            print("\nInterrupted. Progress saved - run again to resume.")