"""
Fetch vocalised Syriac word forms from SEDRA database.

SEDRA (Syriac Electronic Data Retrieval Archive) provides:
- Unpointed Syriac forms
- West Syriac (Serto) vocalised forms
- East Syriac (Madnḥaya) vocalised forms
- Grammatical info (gender, number, tense, etc.)
- English glosses

Source: https://sedra.bethmardutho.org/
"""

import asyncio
import json
from pathlib import Path

import aiohttp
from tqdm import tqdm

BASE_URL = "https://sedra.bethmardutho.org/api"
OUTPUT_FILE = Path(__file__).parent / "sedra_vocalised.jsonl"
PROGRESS_FILE = Path(__file__).parent / ".sedra_progress"

# Approximate upper bound of word IDs (found by probing)
MAX_WORD_ID = 65000
BATCH_SIZE = 20  # Concurrent requests - keep low to avoid overwhelming the server
DELAY_BETWEEN_BATCHES = 2.0  # Seconds between batches
MAX_RETRIES = 3
SAVE_EVERY = 500  # Save progress every N words


async def fetch_word(
    session: aiohttp.ClientSession, word_id: int, retries: int = 0
) -> dict | None:
    """Fetch a single word from the SEDRA API, retrying on 429 and timeouts.

    Args:
        session: Open aiohttp session used for the request.
        word_id: Numeric SEDRA word ID appended to ``{BASE_URL}/word/``.
        retries: Number of retries already consumed; counts towards
            ``MAX_RETRIES`` and sets the starting backoff exponent.

    Returns:
        The first element of the JSON list returned by the API, or ``None``
        when the word is missing, the payload has an unexpected shape, the
        request fails, or the retry budget is exhausted.
    """
    url = f"{BASE_URL}/word/{word_id}"
    attempt = retries
    while True:
        try:
            async with session.get(
                url, timeout=aiohttp.ClientTimeout(total=15)
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # Only a non-empty list of objects is usable; anything
                    # else is treated as "no such word".
                    if isinstance(data, list) and data and isinstance(data[0], dict):
                        return data[0]
                    return None
                if resp.status != 429:
                    return None
                # Rate limited: exponential backoff
                delay = 2**attempt
        except asyncio.TimeoutError:
            delay = 1
        except Exception:
            # Deliberate best-effort: one bad ID (connection error, invalid
            # JSON, ...) must not abort a crawl that runs for hours.
            return None

        if attempt >= MAX_RETRIES:
            return None
        # Sleep outside the ``async with`` so the connection is released
        # back to the pool while waiting.
        await asyncio.sleep(delay)
        attempt += 1


async def fetch_batch(
    session: aiohttp.ClientSession, start_id: int, batch_size: int
) -> list[dict]:
    """Fetch ``batch_size`` consecutive word IDs concurrently.

    Returns the successfully fetched words in ID order; IDs for which
    ``fetch_word`` returned ``None`` are dropped.
    """
    word_ids = range(start_id, start_id + batch_size)
    results = await asyncio.gather(*(fetch_word(session, i) for i in word_ids))
    return [word for word in results if word is not None]


def check_existing_data() -> int:
    """Return the number of lines in ``OUTPUT_FILE``, or 0 if it is missing
    or unreadable."""
    if not OUTPUT_FILE.exists():
        return 0
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)
    except (OSError, UnicodeDecodeError):
        return 0


def load_progress() -> tuple[int, list[dict]]:
    """Load progress from the checkpoint file.

    Returns:
        ``(start_id, accumulated_words)``. Falls back to ``(1, [])`` when the
        checkpoint is missing, unreadable, or does not have the expected
        ``{"next_id": int, "words": list}`` shape.
    """
    if not PROGRESS_FILE.exists():
        return 1, []
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        return 1, []

    if not isinstance(data, dict):
        return 1, []
    next_id = data.get("next_id", 1)
    words = data.get("words", [])
    if not isinstance(next_id, int) or next_id < 1 or not isinstance(words, list):
        return 1, []
    return next_id, words


def save_progress(next_id: int, words: list[dict]) -> None:
    """Save progress to the checkpoint file.

    The checkpoint is written to a sibling temp file and then renamed over
    ``PROGRESS_FILE`` so that an interrupt in the middle of the dump cannot
    leave a truncated, unparseable checkpoint behind.
    """
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump({"next_id": next_id, "words": words}, f)
    tmp_file.replace(PROGRESS_FILE)


def clear_progress() -> None:
    """Remove the checkpoint file if it exists."""
    PROGRESS_FILE.unlink(missing_ok=True)


def word_to_record(word: dict) -> dict:
    """Convert a SEDRA word object to our record format.

    Only the fields listed below are kept; missing string fields default to
    ``""`` and missing glosses to ``{}``.
    """
    return {
        "syriac": word.get("syriac", ""),
        "western": word.get("western", ""),
        "eastern": word.get("eastern", ""),
        "category": word.get("category", ""),
        "gender": word.get("gender", ""),
        "number": word.get("number", ""),
        "tense": word.get("tense", ""),
        "person": word.get("person", ""),
        "glosses": word.get("glosses", {}),
    }


def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines (one object per line)."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )


async def main(force: bool = False, resume: bool = True):
    """
    Fetch all vocalised words from SEDRA.

    Args:
        force: If True, re-fetch even if data exists
        resume: If True, resume from last checkpoint
    """
    # OUTPUT_FILE is only written after a complete run, so any existing
    # entries mean the download already finished. (The number of entries is
    # the number of IDs that actually exist, so it cannot be compared against
    # MAX_WORD_ID.)
    existing_count = check_existing_data()
    if existing_count > 0 and not force:
        print(f"SEDRA data already exists with {existing_count} entries.")
        print("Use --force to re-download.")
        return

    # Check for resume
    if force or not resume:
        start_id, all_words = 1, []
    else:
        start_id, all_words = load_progress()

    if start_id > 1:
        print(
            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
        )
    else:
        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")
        print("Progress is saved periodically. Press Ctrl+C to pause safely.")

    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
    # First ID whose batch is NOT yet in all_words. Advanced only after a
    # batch has been appended, so an interrupt during the inter-batch sleep
    # does not cause that batch to be fetched (and stored) twice on resume.
    next_id = start_id

    try:
        async with aiohttp.ClientSession(connector=connector) as session:
            with tqdm(
                total=MAX_WORD_ID,
                initial=start_id - 1,
                desc="Fetching SEDRA",
                unit="words",
            ) as pbar:
                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
                    # Clamp the final batch so we never go past MAX_WORD_ID.
                    size = min(BATCH_SIZE, MAX_WORD_ID + 1 - batch_start)
                    batch = await fetch_batch(session, batch_start, size)
                    all_words.extend(word_to_record(w) for w in batch)
                    next_id = batch_start + size

                    pbar.update(size)
                    pbar.set_postfix({"found": len(all_words)})

                    # Save progress periodically
                    if batch_start % SAVE_EVERY < BATCH_SIZE:
                        save_progress(next_id, all_words)

                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    except (KeyboardInterrupt, asyncio.CancelledError):
        # Save first so the message below is true when it is printed.
        save_progress(next_id, all_words)
        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
        print("Run again to resume (or use --force to restart).")
        return

    # Save final results
    print(f"\nFetched {len(all_words)} vocalised words")
    _write_jsonl(OUTPUT_FILE, all_words)

    clear_progress()
    print(f"Saved to {OUTPUT_FILE}")


def west_to_east_vowels(western_pointed: str) -> str:
    """
    Return the West Syriac pointed text to use as an East Syriac stand-in.

    In West Syriac, zqapha (ܳ/ܴ/ܵ) represents o (the ā→o shift).
    In East Syriac, zqapha represents ā (preserves original).

    This is a heuristic fallback when no Eastern form is available. The
    input is returned unchanged: the Syriac characters do not need to be
    rewritten, because the o/ā difference is produced later by
    transliterating the same pointed text with East rules.
    """
    return western_pointed


def _make_pair(
    unpointed: str, latin: str, pointed: str, dialect: str, source: str, category: str
) -> dict:
    """Build one training-pair record in the ``{"transliteration": {...}}`` layout."""
    return {
        "transliteration": {
            "src": unpointed,  # Input: unpointed
            "tgt": latin,  # Output: vocalised Latin
            "src_pointed": pointed,  # Reference
            "dialect": dialect,
            "source": source,
            "category": category,
        }
    }


def generate_training_pairs():
    """
    Generate training pairs from SEDRA vocalised data.

    Creates high-quality Syriac → Latin transliteration pairs using
    the vocalised forms from SEDRA.

    Fallback strategy:
    - If Eastern form is missing, use Western form with East transliteration rules
      (which converts zqapha o → ā automatically)
    - Skip entries that have neither Western nor Eastern forms

    Reads ``OUTPUT_FILE`` and writes ``sedra_west_pairs.jsonl`` and
    ``sedra_east_pairs.jsonl`` next to this script.
    """
    # Imported lazily so that fetching does not depend on this module.
    from generate_syr_lat_pairs import (SyriacDialect,
                                        transliterate_word_pointed)

    input_file = OUTPUT_FILE
    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"

    if not input_file.exists():
        print(f"Run the fetch first: python {__file__}")
        return

    west_pairs = []
    east_pairs = []
    skipped_no_vocalised = 0
    fallback_count = 0

    with open(input_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Generating transliteration pairs"):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline added by hand).
                continue
            record = json.loads(line)

            unpointed = record.get("syriac", "")
            western = record.get("western", "")
            eastern = record.get("eastern", "")
            category = record.get("category", "")

            if not unpointed:
                continue

            # Skip entries that have neither Western nor Eastern vocalised forms
            if not western and not eastern:
                skipped_no_vocalised += 1
                continue

            # Generate West Syriac pair (using vocalised western form)
            if western:
                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
                if west_latin and len(west_latin) > 1:
                    west_pairs.append(
                        _make_pair(
                            unpointed, west_latin, western, "west", "sedra", category
                        )
                    )

            # Generate East Syriac pair.
            # If Eastern form exists, use it; otherwise fall back to the
            # Western pointed form transliterated with East rules, which
            # renders zqapha as ā (East) instead of o (West). At least one
            # of the two forms is non-empty here (checked above).
            if eastern:
                east_pointed = eastern
                source_note = "sedra"
            else:
                east_pointed = western
                source_note = "sedra-west2east"

            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
            if east_latin and len(east_latin) > 1:
                east_pairs.append(
                    _make_pair(
                        unpointed,
                        east_latin,
                        east_pointed,
                        "east",
                        source_note,
                        category,
                    )
                )
                if source_note == "sedra-west2east":
                    fallback_count += 1

    # Save pairs
    _write_jsonl(west_output, west_pairs)
    _write_jsonl(east_output, east_pairs)

    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
    print(f"  (of which {fallback_count} used West→East fallback with o→ā conversion)")
    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")


if __name__ == "__main__":
    import sys

    args = sys.argv[1:]

    if "--pairs" in args:
        generate_training_pairs()
    elif "--help" in args or "-h" in args:
        print("Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]")
        print()
        print(
            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database."
        )
        print(
            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours."
        )
        print("Progress is saved automatically - you can resume if interrupted.")
        print()
        print("Options:")
        print("  --force    Re-fetch data even if it already exists")
        print("  --pairs    Generate training pairs from existing SEDRA data")
        print("  --help     Show this help message")
        print()
        print(
            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &"
        )
    else:
        force = "--force" in args
        try:
            asyncio.run(main(force=force))
        except KeyboardInterrupt:
            print("\nInterrupted. Progress saved - run again to resume.")