| """ |
| Fetch vocalised Syriac word forms from SEDRA database. |
| |
| SEDRA (Syriac Electronic Data Retrieval Archive) provides: |
| - Unpointed Syriac forms |
| - West Syriac (Serto) vocalised forms |
| - East Syriac (Madnḥaya) vocalised forms |
| - Grammatical info (gender, number, tense, etc.) |
| - English glosses |
| |
| Source: https://sedra.bethmardutho.org/ |
| """ |
|
|
| import asyncio |
| import json |
| from pathlib import Path |
|
|
| import aiohttp |
| from tqdm import tqdm |
|
|
# Root of the SEDRA REST API; single words are requested from "<BASE_URL>/word/<id>".
BASE_URL = "https://sedra.bethmardutho.org/api"

# Final JSONL lexicon and the resumable checkpoint, both stored beside this script.
OUTPUT_FILE = Path(__file__).with_name("sedra_vocalised.jsonl")
PROGRESS_FILE = Path(__file__).with_name(".sedra_progress")

# Crawl tuning.
MAX_WORD_ID = 65000  # word IDs 1..MAX_WORD_ID are requested
BATCH_SIZE = 20  # IDs fetched concurrently per batch (also the connection limit)
DELAY_BETWEEN_BATCHES = 2.0  # seconds slept after every batch
MAX_RETRIES = 3  # retry budget per word for HTTP 429 responses and timeouts
SAVE_EVERY = 500  # approximate number of IDs between checkpoint writes
|
|
|
|
async def fetch_word(
    session: aiohttp.ClientSession, word_id: int, retries: int = 0
) -> dict | None:
    """Fetch a single word from the SEDRA API, retrying on 429s and timeouts.

    Args:
        session: Open aiohttp client session used for the request.
        word_id: Numeric SEDRA word ID appended to ``BASE_URL/word/``.
        retries: Number of attempts already spent; retrying stops once this
            reaches ``MAX_RETRIES``.

    Returns:
        The first element of the JSON payload on HTTP 200, or ``None`` when the
        payload is empty, the status is not retryable, the retry budget is
        exhausted, or any other error occurs.
    """
    url = f"{BASE_URL}/word/{word_id}"
    request_timeout = aiohttp.ClientTimeout(total=15)
    attempt = retries

    while True:
        backoff = None
        try:
            async with session.get(url, timeout=request_timeout) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    return data[0] if data else None
                if resp.status == 429:
                    # Rate limited: exponential backoff (1s, 2s, 4s, ...).
                    backoff = 2**attempt
        except asyncio.TimeoutError:
            backoff = 1
        except Exception:
            # Deliberate best-effort: one unreadable word must not abort the crawl.
            return None

        if backoff is None or attempt >= MAX_RETRIES:
            return None

        # The response context has been exited at this point, so no response
        # is kept open while we wait and issue the next attempt.
        await asyncio.sleep(backoff)
        attempt += 1
|
|
|
|
async def fetch_batch(
    session: aiohttp.ClientSession, start_id: int, batch_size: int
) -> list[dict]:
    """Fetch ``batch_size`` consecutive word IDs concurrently.

    IDs ``start_id`` through ``start_id + batch_size - 1`` are requested via
    ``fetch_word``; IDs for which it returned ``None`` are left out of the
    result. The remaining words keep ascending-ID order.
    """
    word_ids = range(start_id, start_id + batch_size)
    outcomes = await asyncio.gather(
        *(fetch_word(session, word_id) for word_id in word_ids)
    )

    found = []
    for outcome in outcomes:
        if outcome is not None:
            found.append(outcome)
    return found
|
|
|
|
def check_existing_data() -> int:
    """Return the number of lines in ``OUTPUT_FILE``.

    Yields 0 when the file does not exist or cannot be read.
    """
    if OUTPUT_FILE.exists():
        try:
            line_total = 0
            with open(OUTPUT_FILE, "r", encoding="utf-8") as handle:
                for line_total, _ in enumerate(handle, start=1):
                    pass
            return line_total
        except Exception:
            # Unreadable output is treated the same as no output at all.
            return 0
    return 0
|
|
|
|
def load_progress() -> tuple[int, list[dict]]:
    """Read the checkpoint written by ``save_progress``.

    Returns:
        ``(next_id, words)`` from ``PROGRESS_FILE``; missing keys default to
        ``1`` and ``[]``. When the file is absent or cannot be parsed the
        result is ``(1, [])``, i.e. a fresh start.
    """
    if PROGRESS_FILE.exists():
        try:
            checkpoint = json.loads(PROGRESS_FILE.read_text(encoding="utf-8"))
            resume_id = checkpoint.get("next_id", 1)
            fetched = checkpoint.get("words", [])
            return resume_id, fetched
        except Exception:
            # A damaged checkpoint falls through to the fresh-start result.
            pass
    return 1, []
|
|
|
|
def save_progress(next_id: int, words: list[dict]) -> None:
    """Write the checkpoint to ``PROGRESS_FILE`` without exposing partial data.

    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
    over ``PROGRESS_FILE``. An interruption while writing therefore leaves the
    previous checkpoint intact instead of a truncated file that would make the
    next run start again from word 1.

    Args:
        next_id: First word ID that still has to be fetched.
        words: All records collected so far.
    """
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump({"next_id": next_id, "words": words}, f)
    # Same-directory rename; replaces any existing checkpoint in one step.
    tmp_file.replace(PROGRESS_FILE)
|
|
|
|
def clear_progress() -> None:
    """Delete ``PROGRESS_FILE``; do nothing if it is already gone."""
    # missing_ok avoids the gap between an exists() check and the unlink().
    PROGRESS_FILE.unlink(missing_ok=True)
|
|
|
|
def word_to_record(word: dict) -> dict:
    """Project a SEDRA word dict onto the fields stored in the output file.

    Absent text fields become ``""`` and absent ``glosses`` become ``{}``.
    Keys not listed here are dropped.
    """
    text_fields = (
        "syriac",
        "western",
        "eastern",
        "category",
        "gender",
        "number",
        "tense",
        "person",
    )
    record = {field: word.get(field, "") for field in text_fields}
    record["glosses"] = word.get("glosses", {})
    return record
|
|
|
|
async def main(force: bool = False, resume: bool = True) -> None:
    """Fetch all vocalised words from SEDRA and write them to ``OUTPUT_FILE``.

    Word IDs ``1..MAX_WORD_ID`` are requested in batches of ``BATCH_SIZE``.
    A checkpoint is written roughly every ``SAVE_EVERY`` IDs and again when
    the run is interrupted, so a later call can pick up where this one stopped.

    Args:
        force: If True, re-fetch from ID 1 even if data or a checkpoint exists.
        resume: If True (and ``force`` is False), continue from the checkpoint.
    """
    existing_count = check_existing_data()
    # OUTPUT_FILE is only written after a complete crawl and holds just the
    # words that were found, so its line count need not equal MAX_WORD_ID.
    # Treat "output present, no checkpoint pending" as already done.
    if existing_count > 0 and not PROGRESS_FILE.exists() and not force:
        print(f"SEDRA data already exists with {existing_count} entries.")
        print("Use --force to re-download.")
        return

    start_id, all_words = (1, []) if force or not resume else load_progress()

    if start_id > 1:
        print(
            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
        )
    else:
        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")

    print("Progress is saved periodically. Press Ctrl+C to pause safely.")

    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
    # First ID whose batch has NOT yet been merged into all_words. Saving this
    # (rather than the current batch start) on interrupt prevents the batch
    # that just completed from being fetched and appended a second time.
    next_id = start_id
    try:
        async with aiohttp.ClientSession(connector=connector) as session:
            # The context manager closes the bar on interruption as well.
            with tqdm(
                total=MAX_WORD_ID,
                initial=start_id - 1,
                desc="Fetching SEDRA",
                unit="words",
            ) as pbar:
                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
                    batch = await fetch_batch(session, batch_start, BATCH_SIZE)
                    all_words.extend(word_to_record(w) for w in batch)
                    next_id = batch_start + BATCH_SIZE
                    pbar.update(BATCH_SIZE)
                    pbar.set_postfix({"found": len(all_words)})

                    # True for the first batch at or after each multiple of
                    # SAVE_EVERY, giving one checkpoint per SAVE_EVERY IDs.
                    if batch_start % SAVE_EVERY < BATCH_SIZE:
                        save_progress(next_id, all_words)

                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    except (KeyboardInterrupt, asyncio.CancelledError):
        save_progress(next_id, all_words)
        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
        print("Run again to resume (or use --force to restart).")
        return

    print(f"\nFetched {len(all_words)} vocalised words")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for record in all_words:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    # The crawl is complete, so the checkpoint is no longer needed.
    clear_progress()
    print(f"Saved to {OUTPUT_FILE}")
|
|
|
|
def west_to_east_vowels(western_pointed: str) -> str:
    """Return West Syriac pointed text for use as a stand-in Eastern form.

    This is currently a pass-through: the input string is returned unchanged
    and no characters are replaced.

    Background carried over from the previous documentation: in West Syriac
    the zqapha vowel is read as "o" (the ā→o shift), while in East Syriac it
    is read as "ā". Consonants and the other vowels are the same.

    NOTE(review): presumably the o/ā difference is meant to be resolved when
    the text is transliterated rather than by rewriting the pointing here;
    confirm before relying on this function to perform any conversion.
    """
    return western_pointed
|
|
|
|
def _build_pair(
    unpointed: str,
    pointed: str,
    latin: str,
    dialect: str,
    source: str,
    category: str,
) -> dict:
    """Wrap one transliteration example in the output record layout."""
    return {
        "transliteration": {
            "src": unpointed,
            "tgt": latin,
            "src_pointed": pointed,
            "dialect": dialect,
            "source": source,
            "category": category,
        }
    }


def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


def generate_training_pairs():
    """
    Generate Syriac → Latin transliteration pairs from the fetched SEDRA data.

    Reads ``OUTPUT_FILE`` and writes ``sedra_west_pairs.jsonl`` and
    ``sedra_east_pairs.jsonl`` next to this script.

    For every record with an unpointed form:
    - a West pair is produced from the Western form, when present;
    - an East pair is produced from the Eastern form, or, if that is missing,
      from the Western form transliterated with ``SyriacDialect.EAST`` and
      tagged with source ``"sedra-west2east"``;
    - records with neither vocalised form are skipped and counted.

    A pair is kept only when the transliteration is longer than one character.
    """
    from generate_syr_lat_pairs import (SyriacDialect,
                                        transliterate_word_pointed)

    input_file = OUTPUT_FILE
    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"

    if not input_file.exists():
        print(f"Run the fetch first: python {__file__}")
        return

    west_pairs = []
    east_pairs = []
    skipped_no_vocalised = 0
    fallback_count = 0

    with open(input_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Generating transliteration pairs"):
            # Tolerate blank lines (e.g. a trailing newline added by an editor).
            if not line.strip():
                continue
            record = json.loads(line)

            unpointed = record.get("syriac", "")
            western = record.get("western", "")
            eastern = record.get("eastern", "")
            category = record.get("category", "")

            if not unpointed:
                continue

            if not western and not eastern:
                skipped_no_vocalised += 1
                continue

            if western:
                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
                if west_latin and len(west_latin) > 1:
                    west_pairs.append(
                        _build_pair(
                            unpointed, western, west_latin, "west", "sedra", category
                        )
                    )

            # At least one form exists here; prefer the real Eastern form and
            # fall back to the Western pointing otherwise.
            if eastern:
                east_pointed = eastern
                source_note = "sedra"
            else:
                east_pointed = western
                source_note = "sedra-west2east"

            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
            if east_latin and len(east_latin) > 1:
                east_pairs.append(
                    _build_pair(
                        unpointed, east_pointed, east_latin, "east", source_note, category
                    )
                )
                if source_note == "sedra-west2east":
                    fallback_count += 1

    _write_jsonl(west_output, west_pairs)
    _write_jsonl(east_output, east_pairs)

    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
    print(f" (of which {fallback_count} used West→East fallback with o→ā conversion)")
    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")
|
|
|
|
# Command-line entry point: --pairs builds training pairs, --help/-h prints
# usage, anything else runs the fetch (optionally with --force).
if __name__ == "__main__":
    import sys

    args = sys.argv[1:]
    wants_help = "--help" in args or "-h" in args

    if "--pairs" in args:
        # --pairs takes precedence over --help when both are given.
        generate_training_pairs()
    elif wants_help:
        usage_lines = [
            "Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]",
            "",
            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database.",
            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours.",
            "Progress is saved automatically - you can resume if interrupted.",
            "",
            "Options:",
            " --force Re-fetch data even if it already exists",
            " --pairs Generate training pairs from existing SEDRA data",
            " --help Show this help message",
            "",
            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &",
        ]
        print("\n".join(usage_lines))
    else:
        force = "--force" in args
        try:
            asyncio.run(main(force=force))
        except KeyboardInterrupt:
            print("\nInterrupted. Progress saved - run again to resume.")
|
|