aramt5 / src /data /fetch_sedra_vocalised.py

Data augmentation and balancing updates for a re-run of v3

11632a3 2 months ago

12.3 kB

	"""
	Fetch vocalised Syriac word forms from SEDRA database.

	SEDRA (Syriac Electronic Data Retrieval Archive) provides:
	- Unpointed Syriac forms
	- West Syriac (Serto) vocalised forms
	- East Syriac (Madnḥaya) vocalised forms
	- Grammatical info (gender, number, tense, etc.)
	- English glosses

	Source: https://sedra.bethmardutho.org/
	"""

	import asyncio
	import json
	from pathlib import Path

	import aiohttp
	from tqdm import tqdm

	# Root endpoint of the SEDRA REST API.
	BASE_URL = "https://sedra.bethmardutho.org/api"

	# The final dataset and the resume checkpoint both live next to this script.
	OUTPUT_FILE = Path(__file__).with_name("sedra_vocalised.jsonl")
	PROGRESS_FILE = Path(__file__).with_name(".sedra_progress")

	# Highest word ID requested; an approximate upper bound found by probing.
	MAX_WORD_ID = 65_000
	# Number of concurrent requests per batch - kept low to avoid overwhelming the server.
	BATCH_SIZE = 20
	# Pause between consecutive batches, in seconds.
	DELAY_BETWEEN_BATCHES = 2.0
	# Maximum number of retries for a single word request.
	MAX_RETRIES = 3
	# Checkpoint interval, in word IDs.
	SAVE_EVERY = 500


	async def fetch_word(
	    session: aiohttp.ClientSession, word_id: int, retries: int = 0
	) -> dict | None:
	    """Fetch a single word from the SEDRA API, retrying transient failures.

	    Args:
	        session: Open aiohttp client session used for the request.
	        word_id: Numeric SEDRA word ID, appended to ``BASE_URL/word/``.
	        retries: Number of attempts already spent; callers normally leave
	            this at 0.

	    Returns:
	        The first element of the JSON list returned by the API, or None when
	        the word is missing, the payload has an unexpected shape, the request
	        fails permanently, or the retry budget (MAX_RETRIES) is exhausted.
	    """
	    url = f"{BASE_URL}/word/{word_id}"
	    timeout = aiohttp.ClientTimeout(total=15)
	    attempt = retries

	    while True:
	        try:
	            async with session.get(url, timeout=timeout) as resp:
	                if resp.status == 200:
	                    data = await resp.json()
	                    # Only a non-empty list whose first item is a dict is usable.
	                    if isinstance(data, list) and data and isinstance(data[0], dict):
	                        return data[0]
	                    return None
	                if resp.status != 429:
	                    # Any other status (e.g. 404) is treated as "no such word".
	                    return None
	                # Rate limited: exponential backoff, applied after the
	                # connection has been released back to the pool.
	                delay = 2**attempt
	        except (asyncio.TimeoutError, aiohttp.ClientConnectionError):
	            # Timeouts and dropped connections are transient; retry shortly.
	            delay = 1
	        except (aiohttp.ClientError, ValueError):
	            # Malformed response (wrong content type, invalid JSON, ...).
	            return None

	        if attempt >= MAX_RETRIES:
	            return None
	        await asyncio.sleep(delay)
	        attempt += 1


	async def fetch_batch(
	    session: aiohttp.ClientSession, start_id: int, batch_size: int
	) -> list[dict]:
	    """Request ``batch_size`` consecutive word IDs concurrently.

	    IDs ``start_id`` .. ``start_id + batch_size - 1`` are fetched with
	    ``fetch_word``; IDs for which it returned None are dropped from the result.
	    """
	    word_ids = range(start_id, start_id + batch_size)
	    fetched = await asyncio.gather(*[fetch_word(session, wid) for wid in word_ids])
	    return list(filter(lambda word: word is not None, fetched))


	def check_existing_data() -> int:
	    """Return the number of lines in OUTPUT_FILE.

	    Yields 0 when the file is absent or cannot be read.
	    """
	    if OUTPUT_FILE.exists():
	        try:
	            with OUTPUT_FILE.open("r", encoding="utf-8") as handle:
	                line_total = 0
	                for _line in handle:
	                    line_total += 1
	                return line_total
	        except Exception:
	            # Unreadable file is treated the same as no file at all.
	            pass
	    return 0


	def load_progress() -> tuple[int, list[dict]]:
	    """Read the checkpoint written by ``save_progress``.

	    Returns:
	        ``(next_id, words)`` from PROGRESS_FILE, or ``(1, [])`` when the file
	        is missing or cannot be parsed.
	    """
	    fresh_start = (1, [])
	    if not PROGRESS_FILE.exists():
	        return fresh_start

	    try:
	        with PROGRESS_FILE.open("r", encoding="utf-8") as handle:
	            checkpoint = json.load(handle)
	        next_id = checkpoint.get("next_id", 1)
	        words = checkpoint.get("words", [])
	    except Exception:
	        # A damaged checkpoint means starting over from the first ID.
	        return fresh_start
	    return next_id, words


	def save_progress(next_id: int, words: list[dict]):
	    """Atomically write the resume checkpoint to PROGRESS_FILE.

	    Args:
	        next_id: First word ID that still has to be fetched.
	        words: All records accumulated so far.

	    The checkpoint is written to a sibling temporary file first and then
	    moved over PROGRESS_FILE, so an interrupt in the middle of the write
	    cannot leave a truncated checkpoint behind (which ``load_progress``
	    would silently treat as "start from scratch").
	    """
	    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
	    with open(tmp_file, "w", encoding="utf-8") as f:
	        json.dump({"next_id": next_id, "words": words}, f)
	    tmp_file.replace(PROGRESS_FILE)


	def clear_progress():
	    """Delete the checkpoint file if one is present."""
	    if not PROGRESS_FILE.exists():
	        return
	    PROGRESS_FILE.unlink()


	def word_to_record(word: dict) -> dict:
	    """Project a SEDRA word dict onto the fields stored in the output file.

	    Missing text fields default to an empty string; a missing ``glosses``
	    entry defaults to an empty dict. Keys not listed here are discarded.
	    """
	    text_fields = (
	        "syriac",
	        "western",
	        "eastern",
	        "category",
	        "gender",
	        "number",
	        "tense",
	        "person",
	    )
	    record = {field: word.get(field, "") for field in text_fields}
	    record["glosses"] = word.get("glosses", {})
	    return record


	async def main(force: bool = False, resume: bool = True):
	    """
	    Fetch all vocalised words from SEDRA and write them to OUTPUT_FILE.

	    Args:
	        force: If True, re-fetch from word ID 1 even if data exists.
	        resume: If True, resume from the last checkpoint (when present).

	    Word IDs from the starting ID up to MAX_WORD_ID are requested in batches
	    of BATCH_SIZE with DELAY_BETWEEN_BATCHES seconds between batches. A
	    checkpoint is written periodically and again on interrupt; after a
	    complete run the records are written as JSONL and the checkpoint removed.
	    """
	    # A finished download leaves a non-empty output file and no checkpoint
	    # (the checkpoint is removed after the final write). The number of
	    # entries is not compared with MAX_WORD_ID because not every ID has to
	    # resolve to a word.
	    existing_count = check_existing_data()
	    if existing_count > 0 and not force and not PROGRESS_FILE.exists():
	        print(f"SEDRA data already exists with {existing_count} entries.")
	        print("Use --force to re-download.")
	        return

	    # Check for resume
	    start_id, all_words = (1, []) if force or not resume else load_progress()

	    if start_id > 1:
	        print(
	            f"Resuming from word ID {start_id} ({len(all_words)} words already fetched)"
	        )
	    else:
	        print(f"Fetching SEDRA vocalised lexicon (~{MAX_WORD_ID} word forms)...")

	    print("Progress is saved periodically. Press Ctrl+C to pause safely.")

	    connector = aiohttp.TCPConnector(limit=BATCH_SIZE)
	    # First ID whose batch is NOT yet reflected in all_words. It is advanced
	    # right after a batch is appended, so an interrupt during the following
	    # sleep does not cause that batch to be fetched (and stored) twice.
	    next_id = start_id
	    try:
	        async with aiohttp.ClientSession(connector=connector) as session:
	            # The context manager closes the bar on interrupt as well.
	            with tqdm(
	                total=MAX_WORD_ID,
	                initial=start_id - 1,
	                desc="Fetching SEDRA",
	                unit="words",
	            ) as pbar:
	                for batch_start in range(start_id, MAX_WORD_ID + 1, BATCH_SIZE):
	                    # Never request IDs beyond MAX_WORD_ID in the last batch.
	                    size = min(BATCH_SIZE, MAX_WORD_ID - batch_start + 1)
	                    batch = await fetch_batch(session, batch_start, size)
	                    records = [word_to_record(w) for w in batch]
	                    all_words.extend(records)
	                    next_id = batch_start + size
	                    pbar.update(size)
	                    pbar.set_postfix({"found": len(all_words)})

	                    # Save progress periodically
	                    if batch_start % SAVE_EVERY < BATCH_SIZE:
	                        save_progress(next_id, all_words)

	                    await asyncio.sleep(DELAY_BETWEEN_BATCHES)

	    except (KeyboardInterrupt, asyncio.CancelledError):
	        # Save first, then report, so the message is only shown once true.
	        save_progress(next_id, all_words)
	        print(f"\n\nPaused at word ID {next_id}. Progress saved.")
	        print("Run again to resume (or use --force to restart).")
	        return

	    # Save final results
	    print(f"\nFetched {len(all_words)} vocalised words")

	    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
	        for record in all_words:
	            f.write(json.dumps(record, ensure_ascii=False) + "\n")

	    clear_progress()
	    print(f"Saved to {OUTPUT_FILE}")


	def west_to_east_vowels(western_pointed: str) -> str:
	    """
	    Return the West Syriac pointed text to use as an East Syriac stand-in.

	    The input is returned unchanged: no Syriac characters are rewritten here.
	    The West/East difference for zqapha (o in West Syriac, ā in East Syriac)
	    is expected to be applied later, when the same pointed text is
	    transliterated with East rules.

	    This is a heuristic fallback for entries without an Eastern form.
	    """
	    # Identity on purpose: consonants and vowel marks stay as they are.
	    return western_pointed


	def generate_training_pairs():
	    """
	    Generate training pairs from SEDRA vocalised data.

	    Reads OUTPUT_FILE and writes two JSONL files next to this script
	    (sedra_west_pairs.jsonl and sedra_east_pairs.jsonl) containing
	    Syriac → Latin transliteration pairs built from the vocalised forms.

	    Fallback strategy:
	    - If the Eastern form is missing, the Western pointed form is
	      transliterated with East rules and tagged "sedra-west2east"
	    - Entries that have neither Western nor Eastern forms are skipped
	    - Entries without an unpointed form, and blank lines, are ignored
	    """
	    from generate_syr_lat_pairs import (SyriacDialect,
	                                        transliterate_word_pointed)

	    input_file = OUTPUT_FILE
	    west_output = Path(__file__).parent / "sedra_west_pairs.jsonl"
	    east_output = Path(__file__).parent / "sedra_east_pairs.jsonl"

	    if not input_file.exists():
	        print(f"Run the fetch first: python {__file__}")
	        return

	    west_pairs = []
	    east_pairs = []
	    skipped_no_vocalised = 0
	    fallback_count = 0  # East pairs built from the Western pointed form

	    with open(input_file, "r", encoding="utf-8") as f:
	        for line in tqdm(f, desc="Generating transliteration pairs"):
	            # Tolerate blank lines (e.g. a trailing newline added by an editor)
	            if not line.strip():
	                continue
	            record = json.loads(line)

	            unpointed = record.get("syriac", "")
	            western = record.get("western", "")
	            eastern = record.get("eastern", "")
	            category = record.get("category", "")

	            if not unpointed:
	                continue

	            # Skip entries that have neither Western nor Eastern vocalised forms
	            if not western and not eastern:
	                skipped_no_vocalised += 1
	                continue

	            # Generate West Syriac pair (using vocalised western form)
	            if western:
	                west_latin = transliterate_word_pointed(western, SyriacDialect.WEST)
	                if west_latin and len(west_latin) > 1:
	                    west_pairs.append(
	                        _sedra_pair(
	                            unpointed, west_latin, western, "west", "sedra", category
	                        )
	                    )

	            # Generate East Syriac pair. At least one form is non-empty here,
	            # so a missing Eastern form implies a usable Western one.
	            if eastern:
	                east_pointed = eastern
	                source_note = "sedra"
	            else:
	                # Fallback: Western pointed form transliterated with East
	                # rules, so zqapha is read as ā (East) instead of o (West).
	                east_pointed = western
	                source_note = "sedra-west2east"

	            east_latin = transliterate_word_pointed(east_pointed, SyriacDialect.EAST)
	            if east_latin and len(east_latin) > 1:
	                east_pairs.append(
	                    _sedra_pair(
	                        unpointed, east_latin, east_pointed, "east", source_note, category
	                    )
	                )
	                if source_note == "sedra-west2east":
	                    fallback_count += 1

	    # Save pairs
	    _write_jsonl(west_output, west_pairs)
	    _write_jsonl(east_output, east_pairs)

	    print(f"\nGenerated {len(west_pairs)} West Syriac pairs → {west_output}")
	    print(f"Generated {len(east_pairs)} East Syriac pairs → {east_output}")
	    print(f" (of which {fallback_count} used West→East fallback with o→ā conversion)")
	    print(f"Skipped {skipped_no_vocalised} entries with no vocalised forms")


	def _sedra_pair(
	    unpointed: str,
	    latin: str,
	    pointed: str,
	    dialect: str,
	    source: str,
	    category: str,
	) -> dict:
	    """Build one training-pair record in the nested ``transliteration`` layout."""
	    return {
	        "transliteration": {
	            "src": unpointed,  # Input: unpointed
	            "tgt": latin,  # Output: vocalised Latin
	            "src_pointed": pointed,  # Reference
	            "dialect": dialect,
	            "source": source,
	            "category": category,
	        }
	    }


	def _write_jsonl(path: Path, records: list[dict]) -> None:
	    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one record per line."""
	    with open(path, "w", encoding="utf-8") as f:
	        for item in records:
	            f.write(json.dumps(item, ensure_ascii=False) + "\n")


	if __name__ == "__main__":
	    import sys

	    # Minimal flag handling: --pairs wins over --help, which wins over fetching.
	    cli_args = sys.argv[1:]
	    wants_help = "--help" in cli_args or "-h" in cli_args

	    if "--pairs" in cli_args:
	        generate_training_pairs()
	    elif wants_help:
	        help_lines = (
	            "Usage: uv run python fetch_sedra_vocalised.py [OPTIONS]",
	            "",
	            "Fetches 65k (by default) vocalised Syriac word IDs from the SEDRA database.",
	            "Download time depends on API rate limits, but usually ranges from 2-3 hours to ~7 hours.",
	            "Progress is saved automatically - you can resume if interrupted.",
	            "",
	            "Options:",
	            " --force Re-fetch data even if it already exists",
	            " --pairs Generate training pairs from existing SEDRA data",
	            " --help Show this help message",
	            "",
	            "Tip: Run in background with: nohup uv run python fetch_sedra_vocalised.py &",
	        )
	        for help_line in help_lines:
	            print(help_line)
	    else:
	        try:
	            asyncio.run(main(force="--force" in cli_args))
	        except KeyboardInterrupt:
	            print("\nInterrupted. Progress saved - run again to resume.")