crossroderick
/

aramt5

Text Generation

Classical Syriac

text2text-generation

transliteration

Eval Results (legacy)

Model card Files Files and versions

aramt5 / src /data /fetch_syriac_corpus.py

crossroderick's picture

Initial commit

a4462f5 3 months ago

history blame contribute delete

3.28 kB

	#!/usr/bin/env python3
	"""
	Fetch and extract Syriac text from the Digital Syriac Corpus TEI XML files.

	Source: https://github.com/srophe/syriac-corpus
	"""

	import os
	import re
	from pathlib import Path
	from xml.etree import ElementTree as ET

	import requests
	from tqdm import tqdm

	# Prefix-to-URI map for the TEI XML namespace.
	# NOTE(review): not referenced by any function visible in this file - confirm before removing.
	TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

	# Base URL of the raw TEI files; "/<n>.xml" is appended to it for each file number.
	GITHUB_RAW_BASE = "https://raw.githubusercontent.com/srophe/syriac-corpus/main/data/tei"

	# File name of the extracted-text output, created in the same directory as this script.
	OUTPUT_FILE = "syriac_corpus_tei.txt"


	def get_corpus_file_list(max_file_num: int = 692) -> list[int]:
	    """
	    Return the candidate corpus file numbers to try downloading.

	    The corpus stores its texts as numbered files (1.xml, 2.xml, etc.).
	    The result is only a list of candidates: some numbers in the range may
	    have no file behind them, so callers are expected to skip missing ones.

	    Args:
	        max_file_num: Highest file number to include (inclusive). The
	            default of 692 is the bound this script has always probed
	            (Aphrahat's 23 demonstrations plus other texts).

	    Returns:
	        The integers 1..max_file_num in ascending order; an empty list
	        when max_file_num is less than 1.
	    """
	    return list(range(1, max_file_num + 1))


	def extract_syriac_from_tei(xml_content: str) -> list[str]:
	    """
	    Extract Syriac text segments from TEI XML content.

	    Every element whose ``xml:lang`` attribute is exactly ``"syr"`` (for
	    example ``<p xml:lang="syr">`` or ``<ab xml:lang="syr">``, but any tag
	    qualifies) contributes one segment: all of its text, nested elements
	    included, with runs of whitespace collapsed to a single space.

	    Once an element has been captured its descendants are not visited
	    again. Otherwise a Syriac-tagged child of a Syriac-tagged parent would
	    be emitted twice: once inside the parent's text and once on its own.

	    Args:
	        xml_content: The XML document as a string.

	    Returns:
	        The non-empty segments in document order. An empty list is returned
	        (and the parse error printed) when the XML is not well-formed.
	    """
	    # ElementTree exposes the predefined "xml:" prefix in Clark notation.
	    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

	    try:
	        root = ET.fromstring(xml_content)
	    except ET.ParseError as e:
	        print(f"XML parse error: {e}")
	        return []

	    syriac_texts = []
	    # Explicit stack; children are pushed reversed so pops follow document order.
	    pending = [root]
	    while pending:
	        elem = pending.pop()
	        if elem.get(xml_lang) == "syr":
	            # itertext() already covers every descendant, so do not descend.
	            text = "".join(elem.itertext()).strip()
	            if text:
	                syriac_texts.append(re.sub(r"\s+", " ", text))
	            continue
	        pending.extend(reversed(list(elem)))

	    return syriac_texts


	def fetch_and_extract(file_num: int) -> list[str]:
	    """
	    Download one numbered TEI XML file and return its Syriac segments.

	    An HTTP 200 response is handed to extract_syriac_from_tei. A 404 means
	    the file does not exist and is skipped without any message. Any other
	    status, and any requests-level failure, is printed. Every non-200
	    outcome yields an empty list.
	    """
	    url = f"{GITHUB_RAW_BASE}/{file_num}.xml"

	    try:
	        response = requests.get(url, timeout=30)
	        status = response.status_code
	        if status == 200:
	            return extract_syriac_from_tei(response.text)
	        if status != 404:
	            # 404 is the expected "no such file" case and stays silent.
	            print(f"Error fetching {url}: HTTP {status}")
	    except requests.RequestException as e:
	        print(f"Request error for {url}: {e}")

	    return []


	def main():
	    """
	    Fetch every candidate TEI file and save the extracted Syriac text.

	    Segments from all files are gathered in order and written one per line
	    to OUTPUT_FILE in this script's directory. Progress, totals and a short
	    sample are printed along the way.
	    """
	    print(f"Source: {GITHUB_RAW_BASE}")
	    print()

	    collected = []
	    hits = 0

	    for number in tqdm(get_corpus_file_list(), desc="Fetching TEI files"):
	        segments = fetch_and_extract(number)
	        if not segments:
	            # Missing file, failed request, or no Syriac content.
	            continue
	        hits += 1
	        collected.extend(segments)

	    print(f"\nFound {hits} files with Syriac text")
	    print(f"Extracted {len(collected)} text segments")

	    # One segment per line, UTF-8 encoded.
	    destination = Path(__file__).parent / OUTPUT_FILE
	    with open(destination, "w", encoding="utf-8") as out:
	        out.writelines(f"{segment}\n" for segment in collected)

	    print(f"Saved to: {destination}")

	    # Preview the first few segments, each truncated to 80 characters.
	    if collected:
	        print("\nSample (first 3 segments):")
	        for position, segment in enumerate(collected[:3], start=1):
	            print(f" {position}. {segment[:80]}...")


	# Run only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()