File size: 12,292 Bytes
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90fca5
 
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90fca5
 
 
11632a3
e90fca5
 
11632a3
e90fca5
 
 
 
 
 
 
 
 
 
a4462f5
 
 
 
 
 
11632a3
e90fca5
 
 
 
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
e90fca5
a4462f5
 
 
 
 
 
 
 
 
 
 
 
e90fca5
 
 
 
 
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90fca5
 
a4462f5
e90fca5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4462f5
e90fca5
 
a4462f5
 
 
 
 
 
 
 
 
 
e90fca5
11632a3
 
 
e90fca5
 
a4462f5
e90fca5
 
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11632a3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
"""
Fetch vocalised Syriac word forms from SEDRA database.

SEDRA (Syriac Electronic Data Retrieval Archive) provides:
- Unpointed Syriac forms
- West Syriac (Serto) vocalised forms
- East Syriac (Madnḥaya) vocalised forms
- Grammatical info (gender, number, tense, etc.)
- English glosses

Source: https://sedra.bethmardutho.org/
"""

import asyncio
import json
from pathlib import Path

import aiohttp
from tqdm import tqdm

# Root of the SEDRA REST API; fetch_word() requests "<BASE_URL>/word/<id>".
BASE_URL = "https://sedra.bethmardutho.org/api"
# Final JSONL output (one record per line), stored next to this script.
OUTPUT_FILE = Path(__file__).parent / "sedra_vocalised.jsonl"
# JSON checkpoint ({"next_id": ..., "words": [...]}) used to resume a fetch;
# removed by clear_progress() once the final output has been written.
PROGRESS_FILE = Path(__file__).parent / ".sedra_progress"

# Approximate upper bound of word IDs (found by probing)
MAX_WORD_ID = 65000
BATCH_SIZE = 20  # Concurrent requests - keep low to avoid overwhelming the server
DELAY_BETWEEN_BATCHES = 2.0  # Seconds between batches
MAX_RETRIES = 3  # Maximum retry attempts per word ID in fetch_word()
SAVE_EVERY = 500  # Checkpoint roughly every N word IDs (not N found words)


async def fetch_word(
    session: aiohttp.ClientSession, word_id: int, retries: int = 0
) -> dict | None:
    """Fetch a single word from the SEDRA API, retrying transient failures.

    Args:
        session: Open aiohttp client session used for the request.
        word_id: Numeric SEDRA word ID, appended to ``BASE_URL/word/``.
        retries: Number of attempts already consumed. Callers normally leave
            this at 0; at least one request is always made.

    Returns:
        The first element of the JSON list returned by the API, or ``None``
        when the response is not a 200 with a non-empty list, or when all
        retries are exhausted.

    Retries happen on HTTP 429, on timeouts and on connection-level errors,
    up to ``MAX_RETRIES`` times. Any other failure is reported and yields
    ``None`` so that one bad word never aborts the whole crawl.
    """
    url = f"{BASE_URL}/word/{word_id}"
    timeout = aiohttp.ClientTimeout(total=15)
    attempt = retries

    while True:
        try:
            async with session.get(url, timeout=timeout) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # The original code indexes data[0]; anything that is not
                    # a non-empty list is treated as "no word".
                    if isinstance(data, list) and data:
                        return data[0]
                    return None
                if resp.status != 429:
                    # Other statuses are not retried.
                    return None
            # Rate limited. The response (and its pooled connection) has been
            # released by leaving the `async with` block *before* backing off,
            # so waiting tasks are not starved of connections.
            delay = 2**attempt  # Exponential backoff
        except asyncio.TimeoutError:
            delay = 1
        except aiohttp.ClientConnectionError:
            # Dropped/refused connections are usually transient; retry them
            # instead of silently losing the word.
            delay = 2**attempt
        except Exception as exc:
            # Best effort: skip this word, but make the failure visible.
            tqdm.write(f"Word {word_id}: skipped after unexpected error: {exc!r}")
            return None

        if attempt >= MAX_RETRIES:
            return None
        await asyncio.sleep(delay)
        attempt += 1


async def fetch_batch(
    session: aiohttp.ClientSession, start_id: int, batch_size: int
) -> list[dict]:
    """Concurrently fetch ``batch_size`` consecutive word IDs from ``start_id``.

    Returns the successfully fetched words in ID order; IDs for which
    ``fetch_word`` returned ``None`` are left out.
    """
    word_ids = range(start_id, start_id + batch_size)
    fetched = await asyncio.gather(
        *(fetch_word(session, word_id) for word_id in word_ids)
    )
    found = []
    for entry in fetched:
        if entry is not None:
            found.append(entry)
    return found


def check_existing_data() -> int:
    """Return the number of lines in OUTPUT_FILE.

    Yields 0 when the file does not exist or cannot be read.
    """
    if not OUTPUT_FILE.exists():
        return 0

    line_total = 0
    try:
        with OUTPUT_FILE.open("r", encoding="utf-8") as handle:
            for _ in handle:
                line_total += 1
    except Exception:
        return 0
    return line_total


def load_progress() -> tuple[int, list[dict]]:
    """Read the checkpoint file and return ``(start_id, accumulated_words)``.

    Falls back to ``(1, [])`` when there is no checkpoint or it cannot be
    read or parsed. Missing keys default to ``1`` and ``[]`` respectively.
    """
    if PROGRESS_FILE.exists():
        try:
            checkpoint = json.loads(PROGRESS_FILE.read_text(encoding="utf-8"))
            resume_id = checkpoint.get("next_id", 1)
            saved_words = checkpoint.get("words", [])
            return resume_id, saved_words
        except Exception:
            pass
    return 1, []


def save_progress(next_id: int, words: list[dict]) -> None:
    """Atomically write the checkpoint file.

    Args:
        next_id: First word ID that still has to be fetched on resume.
        words: All records accumulated so far.

    The data is written to a sibling temporary file and then renamed over
    PROGRESS_FILE, so an interrupt in the middle of the write cannot leave a
    truncated checkpoint behind (which load_progress() would treat as "no
    progress" and restart from ID 1).
    """
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump({"next_id": next_id, "words": words}, f)
    # Same directory, so the rename replaces the old checkpoint in one step.
    tmp_file.replace(PROGRESS_FILE)


def clear_progress() -> None:
    """Remove the checkpoint file; do nothing if it is already gone."""
    # missing_ok avoids the exists()/unlink() race of a separate check.
    PROGRESS_FILE.unlink(missing_ok=True)


def word_to_record(word: dict) -> dict:
    """Project a raw SEDRA word dict onto the fields stored in the output.

    Text fields that are absent default to ``""``; ``glosses`` defaults to an
    empty dict. Keys not listed here are dropped.
    """
    text_fields = (
        "syriac",
        "western",
        "eastern",
        "category",
        "gender",
        "number",
        "tense",
        "person",
    )
    record = {field: word.get(field, "") for field in text_fields}
    record["glosses"] = word.get("glosses", {})
    return record


async def main(force: bool = False, resume: bool = True) -> None:
    """
    Fetch all vocalised words from SEDRA and write them to OUTPUT_FILE.

    Args:
        force: If True, re-fetch even if data exists
        resume: If True, resume from last checkpoint

    Word IDs from the start ID up to MAX_WORD_ID are fetched in batches of
    BATCH_SIZE, pausing DELAY_BETWEEN_BATCHES seconds after each batch.
    Progress is checkpointed periodically and on interruption; the
    checkpoint is removed once the final JSONL file has been written.
    """
    # A finished run leaves a non-empty output file and no checkpoint. The
    # number of lines is the number of words *found*, which is not expected
    # to equal MAX_WORD_ID, so it must not be compared against it.
    existing_count = check_existing_data()
    fetch_in_progress = resume and PROGRESS_FILE.exists()
    if existing_count > 0 and not force and not fetch_in_progress:
        print(f"SEDRA data already exists with {existing_count} entries.")
        print("Use --force to re-download.")
        return

    # Check for resume
    start_id, all_words = (1, []) if force or not resume else load_progress()

    if start_id > 1:
        print(
            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
        )
    else:
        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")

    print("Progress is saved periodically. Press Ctrl+C to pause safely.")

    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
    # First ID whose words are NOT yet in all_words. It is advanced together
    # with all_words (no await in between), so a checkpoint written on
    # interrupt never causes a batch to be fetched and stored twice.
    next_id = start_id
    try:
        async with aiohttp.ClientSession(connector=connector) as session:
            # The context manager closes the bar on interruption as well.
            with tqdm(
                total=MAX_WORD_ID,
                initial=start_id - 1,
                desc="Fetching SEDRA",
                unit="words",
            ) as pbar:
                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
                    batch = await fetch_batch(session, batch_start, BATCH_SIZE)
                    all_words.extend(word_to_record(w) for w in batch)
                    next_id = batch_start + BATCH_SIZE
                    pbar.update(BATCH_SIZE)
                    pbar.set_postfix({"found": len(all_words)})

                    # Save progress periodically: fires for the first batch
                    # at or after each multiple of SAVE_EVERY.
                    if batch_start % SAVE_EVERY < BATCH_SIZE:
                        save_progress(next_id, all_words)

                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    except (KeyboardInterrupt, asyncio.CancelledError):
        # Save first, so the message below is only printed once it is true.
        save_progress(next_id, all_words)
        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
        print("Run again to resume (or use --force to restart).")
        return

    # Save final results
    print(f"\nFetched {len(all_words)} vocalised words")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for record in all_words:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    clear_progress()
    print(f"Saved to {OUTPUT_FILE}")


def west_to_east_vowels(western_pointed: str) -> str:
    """
    Return the West Syriac pointed text to use as a stand-in East Syriac form.

    This is currently an identity function: the input string is returned
    unchanged. Per the original author's note, the West/East difference for
    zqapha (o in West, ā in East) lies in how the same pointed characters are
    read, so no character substitution is done here; the caller is expected
    to transliterate the result with East rules instead.

    Intended as a heuristic fallback when no Eastern form is available.
    """
    # Consonants and vowel marks are deliberately left as they are.
    return western_pointed


def _sedra_pair(
    unpointed: str,
    latin: str,
    pointed: str,
    dialect: str,
    source: str,
    category: str,
) -> dict:
    """Build one training record in the nested ``transliteration`` format."""
    return {
        "transliteration": {
            "src": unpointed,  # Input: unpointed
            "tgt": latin,  # Output: vocalised Latin
            "src_pointed": pointed,  # Reference
            "dialect": dialect,
            "source": source,
            "category": category,
        }
    }


def generate_training_pairs():
    """
    Generate training pairs from SEDRA vocalised data.

    Reads OUTPUT_FILE and writes two JSONL files next to this script
    (``sedra_west_pairs.jsonl`` and ``sedra_east_pairs.jsonl``) containing
    unpointed Syriac → Latin transliteration pairs built from the vocalised
    forms.

    Fallback strategy:
    - If Eastern form is missing, use Western form with East transliteration rules
      (presumably rendering zqapha as ā instead of o - this depends on
      ``transliterate_word_pointed``, which is defined elsewhere)
    - Skip entries that have neither Western nor Eastern forms

    Entries without an unpointed form, blank lines, and transliterations of
    length <= 1 are skipped as well.
    """
    from generate_syr_lat_pairs import (SyriacDialect,
                                        transliterate_word_pointed)

    input_file = OUTPUT_FILE
    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"

    if not input_file.exists():
        print(f"Run the fetch first: python {__file__}")
        return

    west_pairs = []
    east_pairs = []
    skipped_no_vocalised = 0
    fallback_count = 0  # East pairs derived from the Western form

    with open(input_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Generating transliteration pairs"):
            # Tolerate blank lines (e.g. a trailing newline added by an editor).
            if not line.strip():
                continue
            record = json.loads(line)

            unpointed = record.get("syriac", "")
            western = record.get("western", "")
            eastern = record.get("eastern", "")
            category = record.get("category", "")

            if not unpointed:
                continue

            # Skip entries that have neither Western nor Eastern vocalised forms
            if not western and not eastern:
                skipped_no_vocalised += 1
                continue

            # Generate West Syriac pair (using vocalised western form)
            if western:
                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
                if west_latin and len(west_latin) > 1:
                    west_pairs.append(
                        _sedra_pair(
                            unpointed, west_latin, western, "west", "sedra", category
                        )
                    )

            # Generate East Syriac pair
            # If Eastern form exists, use it; otherwise fall back to Western with East rules
            if eastern:
                east_pointed = eastern
                source_note = "sedra"
            else:
                # Fallback: use Western pointed form but transliterate with East rules,
                # so that zqapha (o in West) is read the East way (ā).
                # `western` is guaranteed non-empty here by the check above.
                east_pointed = western
                source_note = "sedra-west2east"

            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
            if east_latin and len(east_latin) > 1:
                east_pairs.append(
                    _sedra_pair(
                        unpointed, east_latin, east_pointed, "east", source_note, category
                    )
                )
                if source_note == "sedra-west2east":
                    fallback_count += 1

    # Save pairs
    with open(west_output, "w", encoding="utf-8") as f:
        for pair in west_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    with open(east_output, "w", encoding="utf-8") as f:
        for pair in east_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
    print(f"  (of which {fallback_count} used West→East fallback with o→ā conversion)")
    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")


if __name__ == "__main__":
    import sys

    # Minimal flag parsing: --pairs takes precedence over --help, and
    # anything else starts (or resumes) the download.
    cli_args = sys.argv[1:]
    wants_help = any(flag in cli_args for flag in ("--help", "-h"))

    if "--pairs" in cli_args:
        generate_training_pairs()
    elif wants_help:
        help_lines = (
            "Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]",
            "",
            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database.",
            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours.",
            "Progress is saved automatically - you can resume if interrupted.",
            "",
            "Options:",
            "  --force    Re-fetch data even if it already exists",
            "  --pairs    Generate training pairs from existing SEDRA data",
            "  --help     Show this help message",
            "",
            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &",
        )
        print("\n".join(help_lines))
    else:
        try:
            asyncio.run(main(force="--force" in cli_args))
        except KeyboardInterrupt:
            print("\nInterrupted. Progress saved - run again to resume.")