# File size: 12,292 Bytes

"""
Fetch vocalised Syriac word forms from SEDRA database.

SEDRA (Syriac Electronic Data Retrieval Archive) provides:
- Unpointed Syriac forms
- West Syriac (Serto) vocalised forms
- East Syriac (Madnḥaya) vocalised forms
- Grammatical info (gender, number, tense, etc.)
- English glosses

Source: https://sedra.bethmardutho.org/
"""

import asyncio
import json
from pathlib import Path

import aiohttp
from tqdm import tqdm

# --- Endpoints and on-disk locations -------------------------------------
BASE_URL = "https://sedra.bethmardutho.org/api"
# Both files live next to this script.
OUTPUT_FILE = Path(__file__).with_name("sedra_vocalised.jsonl")
PROGRESS_FILE = Path(__file__).with_name(".sedra_progress")

# --- Crawl tuning ---------------------------------------------------------
# Highest word ID to request; an approximate bound that was found by probing.
MAX_WORD_ID = 65_000
# Number of concurrent requests per batch (kept low to spare the server).
BATCH_SIZE = 20
# Pause, in seconds, inserted after every batch.
DELAY_BETWEEN_BATCHES = 2.0
# How many times a single word fetch may be retried.
MAX_RETRIES = 3
# Checkpoint roughly every this many word IDs.
SAVE_EVERY = 500


async def fetch_word(
    session: aiohttp.ClientSession, word_id: int, retries: int = 0
) -> dict | None:
    """Fetch a single word from the SEDRA API, retrying on 429 and timeouts.

    Args:
        session: Open aiohttp session used to issue the request.
        word_id: Numeric SEDRA word ID; requested as ``{BASE_URL}/word/{word_id}``.
        retries: Number of retries already consumed. Callers normally leave
            this at 0; it is kept for backward compatibility.

    Returns:
        The first element of the JSON array returned by the API, or ``None``
        when the word does not exist, the payload has an unexpected shape,
        the request failed, or the retry budget (``MAX_RETRIES``) is spent.
    """
    url = f"{BASE_URL}/word/{word_id}"
    request_timeout = aiohttp.ClientTimeout(total=15)
    attempt = retries

    # Retry iteratively rather than recursively: the backoff sleep and the
    # next request must happen *after* the previous response context has
    # been exited, otherwise the old connection can stay checked out of the
    # (BATCH_SIZE-limited) pool while we wait for a new one.
    while True:
        try:
            async with session.get(url, timeout=request_timeout) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # Only a non-empty JSON array of objects is usable.
                    if isinstance(data, list) and data and isinstance(data[0], dict):
                        return data[0]
                    return None
                if resp.status != 429:
                    # Any other status is treated as "no such word".
                    return None
            # Rate limited: exponential backoff before the next attempt.
            delay = 2**attempt
        except asyncio.TimeoutError:
            delay = 1
        except (aiohttp.ClientError, OSError, ValueError):
            # Best effort: network failures and undecodable bodies are
            # reported as "not found" so one bad ID cannot abort the crawl.
            return None

        if attempt >= MAX_RETRIES:
            return None
        await asyncio.sleep(delay)
        attempt += 1


async def fetch_batch(
    session: aiohttp.ClientSession, start_id: int, batch_size: int
) -> list[dict]:
    """Request ``batch_size`` consecutive word IDs at once.

    IDs ``start_id`` through ``start_id + batch_size - 1`` are fetched
    concurrently via ``fetch_word``; IDs that yielded ``None`` are dropped
    and the remaining words are returned in ID order.
    """
    word_ids = range(start_id, start_id + batch_size)
    responses = await asyncio.gather(
        *(fetch_word(session, word_id) for word_id in word_ids)
    )

    found = []
    for response in responses:
        if response is not None:
            found.append(response)
    return found


def check_existing_data() -> int:
    """Return the number of lines in OUTPUT_FILE.

    Yields 0 when the file is absent or cannot be read.
    """
    if OUTPUT_FILE.exists():
        try:
            with open(OUTPUT_FILE, "r", encoding="utf-8") as handle:
                line_total = 0
                for line_total, _ in enumerate(handle, start=1):
                    pass
                return line_total
        except Exception:
            # Unreadable file counts as "no data".
            pass
    return 0


def load_progress() -> tuple[int, list[dict]]:
    """Read the checkpoint written by ``save_progress``.

    Returns a ``(next_id, words)`` pair. When PROGRESS_FILE is missing or
    cannot be loaded, the pair for a fresh run, ``(1, [])``, is returned.
    """
    fresh_start = (1, [])
    if not PROGRESS_FILE.exists():
        return fresh_start

    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as handle:
            checkpoint = json.load(handle)
        next_id = checkpoint.get("next_id", 1)
        words = checkpoint.get("words", [])
        return next_id, words
    except Exception:
        # Any problem with the checkpoint means starting over.
        return fresh_start


def save_progress(next_id: int, words: list[dict]):
    """Atomically write the checkpoint to PROGRESS_FILE.

    Args:
        next_id: First word ID that still has to be fetched on resume.
        words: All records accumulated so far.

    The JSON is written to a sibling temporary file first and then moved
    over PROGRESS_FILE. Writing in place would leave a truncated file if the
    process were interrupted mid-dump, and ``load_progress`` treats an
    unreadable checkpoint as "start from ID 1" - losing all progress.
    """
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump({"next_id": next_id, "words": words}, f)
    # Path.replace overwrites the destination in a single rename step.
    tmp_file.replace(PROGRESS_FILE)


def clear_progress():
    """Remove the checkpoint file; a missing file is not an error."""
    # missing_ok avoids the check-then-delete race of exists() + unlink().
    PROGRESS_FILE.unlink(missing_ok=True)


def word_to_record(word: dict) -> dict:
    """Project a SEDRA word dict onto the fields stored in our JSONL output.

    Missing text fields default to an empty string; a missing ``glosses``
    entry defaults to a new empty dict. Any other keys in ``word`` are
    dropped.
    """
    text_fields = (
        "syriac",
        "western",
        "eastern",
        "category",
        "gender",
        "number",
        "tense",
        "person",
    )
    record = {field: word.get(field, "") for field in text_fields}
    record["glosses"] = word.get("glosses", {})
    return record


async def main(force: bool = False, resume: bool = True):
    """
    Fetch all vocalised words from SEDRA and write them to OUTPUT_FILE.

    Word IDs 1..MAX_WORD_ID are requested in batches of BATCH_SIZE with a
    pause of DELAY_BETWEEN_BATCHES seconds after each batch. A checkpoint is
    written periodically and on interrupt; OUTPUT_FILE is only written once
    the whole ID range has been covered, after which the checkpoint is removed.

    Args:
        force: If True, re-fetch even if data exists
        resume: If True, resume from last checkpoint
    """
    # OUTPUT_FILE is only written after a complete run, so any non-empty file
    # is a finished download. (Only found words are stored, so the line count
    # is not expected to equal MAX_WORD_ID.) A pending checkpoint means a
    # newer run is in progress and should be resumed instead.
    existing_count = check_existing_data()
    resumable = resume and PROGRESS_FILE.exists()
    if existing_count > 0 and not force and not resumable:
        print(f"SEDRA data already exists with {existing_count} entries.")
        print("Use --force to re-download.")
        return

    # Check for resume
    start_id, all_words = (1, []) if force or not resume else load_progress()

    if start_id > 1:
        print(
            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
        )
    else:
        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")

    print("Progress is saved periodically. Press Ctrl+C to pause safely.")

    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
    # First ID whose result is NOT yet in all_words. It is advanced only after
    # a batch has been merged, so an interrupt checkpoint never causes a batch
    # to be fetched (and appended) twice on resume.
    next_id = start_id
    try:
        async with aiohttp.ClientSession(connector=connector) as session:
            # The context manager closes the bar on interrupt as well.
            with tqdm(
                total=MAX_WORD_ID,
                initial=start_id - 1,
                desc="Fetching SEDRA",
                unit="words",
            ) as pbar:
                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
                    # Clamp the last batch so we never request IDs past the bound.
                    size = min(BATCH_SIZE, MAX_WORD_ID - batch_start + 1)
                    batch = await fetch_batch(session, batch_start, size)
                    all_words.extend(word_to_record(w) for w in batch)
                    next_id = batch_start + size
                    pbar.update(size)
                    pbar.set_postfix({"found": len(all_words)})

                    # Save progress periodically
                    if batch_start % SAVE_EVERY < BATCH_SIZE:
                        save_progress(next_id, all_words)

                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    except (KeyboardInterrupt, asyncio.CancelledError):
        # Persist first, then report, so the message is only shown once the
        # checkpoint is actually on disk.
        save_progress(next_id, all_words)
        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
        print("Run again to resume (or use --force to restart).")
        return

    # Save final results
    print(f"\nFetched {len(all_words)} vocalised words")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for record in all_words:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    clear_progress()
    print(f"Saved to {OUTPUT_FILE}")


def west_to_east_vowels(western_pointed: str) -> str:
    """
    Return the West Syriac pointed text to use as a stand-in East Syriac form.

    The text is handed back unchanged: no characters are replaced here.
    The rationale, as recorded by the original author, is that zqapha
    (ܳ/ܴ/ܵ) is read as o in West Syriac and as ā in East Syriac, so the
    dialect difference is meant to be applied later by transliterating the
    same pointed text with East rules, not by rewriting the Syriac.

    Intended as a heuristic fallback when no Eastern form is available.
    """
    # Identity on purpose: consonants and vowel signs stay exactly as given.
    return western_pointed


def _sedra_pair(
    unpointed: str,
    latin: str,
    pointed: str,
    dialect: str,
    source: str,
    category: str,
) -> dict:
    """Build one transliteration training record in the output schema."""
    return {
        "transliteration": {
            "src": unpointed,  # Input: unpointed
            "tgt": latin,  # Output: vocalised Latin
            "src_pointed": pointed,  # Reference
            "dialect": dialect,
            "source": source,
            "category": category,
        }
    }


def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Write records to path as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )


def generate_training_pairs():
    """
    Generate training pairs from SEDRA vocalised data.

    Reads OUTPUT_FILE and writes two JSONL files next to this script,
    ``sedra_west_pairs.jsonl`` and ``sedra_east_pairs.jsonl``. Each pair maps
    the unpointed Syriac form to the Latin transliteration of its vocalised
    form.

    Fallback strategy:
    - If Eastern form is missing, use Western form with East transliteration rules
    - Skip entries that have neither Western nor Eastern forms

    Entries without an unpointed form, and transliterations that are empty
    or a single character long, produce no pair.
    """
    from generate_syr_lat_pairs import (SyriacDialect,
                                        transliterate_word_pointed)

    input_file = OUTPUT_FILE
    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"

    if not input_file.exists():
        print(f"Run the fetch first: python {__file__}")
        return

    west_pairs = []
    east_pairs = []
    skipped_no_vocalised = 0
    fallback_count = 0  # East pairs built from the Western pointed form

    with open(input_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Generating transliteration pairs"):
            # Tolerate blank lines (e.g. a trailing newline from manual edits).
            if not line.strip():
                continue
            record = json.loads(line)

            unpointed = record["syriac"]
            western = record["western"]
            eastern = record["eastern"]

            if not unpointed:
                continue

            # Skip entries that have neither Western nor Eastern vocalised forms
            if not western and not eastern:
                skipped_no_vocalised += 1
                continue

            category = record.get("category", "")

            # Generate West Syriac pair (using vocalised western form)
            if western:
                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
                if west_latin and len(west_latin) > 1:
                    west_pairs.append(
                        _sedra_pair(
                            unpointed, west_latin, western, "west", "sedra", category
                        )
                    )

            # Generate East Syriac pair. At least one form is non-empty here,
            # so a missing Eastern form implies a usable Western one.
            if eastern:
                east_pointed = eastern
                source_note = "sedra"
            else:
                # Fallback: reuse the Western pointed form but transliterate
                # it with East rules. The dialect difference is left to the
                # transliterator; the Syriac text itself is not altered.
                east_pointed = western
                source_note = "sedra-west2east"

            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
            if east_latin and len(east_latin) > 1:
                east_pairs.append(
                    _sedra_pair(
                        unpointed,
                        east_latin,
                        east_pointed,
                        "east",
                        source_note,
                        category,
                    )
                )
                if source_note == "sedra-west2east":
                    fallback_count += 1

    # Save pairs
    _write_jsonl(west_output, west_pairs)
    _write_jsonl(east_output, east_pairs)

    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
    print(f"  (of which {fallback_count} used West→East fallback with o→ā conversion)")
    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")


if __name__ == "__main__":
    import sys

    # Minimal flag handling: --pairs wins over --help, which wins over a fetch.
    cli_args = sys.argv[1:]
    wants_help = "--help" in cli_args or "-h" in cli_args

    if "--pairs" in cli_args:
        generate_training_pairs()
    elif wants_help:
        help_lines = (
            "Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]",
            "",
            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database.",
            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours.",
            "Progress is saved automatically - you can resume if interrupted.",
            "",
            "Options:",
            "  --force    Re-fetch data even if it already exists",
            "  --pairs    Generate training pairs from existing SEDRA data",
            "  --help     Show this help message",
            "",
            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &",
        )
        print("\n".join(help_lines))
    else:
        # Default action: run the download, restarting from scratch on --force.
        try:
            asyncio.run(main(force="--force" in cli_args))
        except KeyboardInterrupt:
            print("\nInterrupted. Progress saved - run again to resume.")