# aramt5/src/data/fetch_syriac_corpus.py
# crossroderick: Initial commit (a4462f5)
#!/usr/bin/env python3
"""
Fetch and extract Syriac text from the Digital Syriac Corpus TEI XML files.
Source: https://github.com/srophe/syriac-corpus
"""
import os
import re
from pathlib import Path
from xml.etree import ElementTree as ET
import requests
from tqdm import tqdm
# Prefix-to-URI map for the TEI XML namespace (prefix "tei").
# NOTE(review): not referenced by any function visible in this file.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}
# Base URL of the raw GitHub content; "<number>.xml" is appended per file.
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/srophe/syriac-corpus/main/data/tei"
# Name of the plain-text output file, written next to this script.
OUTPUT_FILE = "syriac_corpus_tei.txt"
def get_corpus_file_list(last_file_num: int = 692) -> list[int]:
    """
    Return the candidate corpus file numbers to try, in ascending order.

    The corpus files are requested as numbered documents (1.xml, 2.xml, ...).
    The numbering may have gaps, so the result is only a list of candidates;
    the caller is expected to skip numbers that turn out not to exist.

    Args:
        last_file_num: Highest file number to include (inclusive). Defaults
            to 692, the upper bound this script has always probed.

    Returns:
        The integers 1..last_file_num, or an empty list when
        last_file_num is less than 1.
    """
    # range() excludes its stop value, hence the +1 to keep the bound inclusive.
    return list(range(1, last_file_num + 1))
# Name of the xml:lang attribute in the "{namespace-uri}local" form that
# ElementTree uses for namespace-qualified attributes.
_XML_LANG_ATTR = "{http://www.w3.org/XML/1998/namespace}lang"
_WHITESPACE_RUN = re.compile(r"\s+")


def _iter_outermost_syriac(root):
    """Yield, in document order, each xml:lang="syr" element with no such ancestor."""
    pending = [root]
    while pending:
        elem = pending.pop()
        if elem.get(_XML_LANG_ATTR) == "syr":
            # itertext() on this element already covers its whole subtree, so
            # descending further would only re-emit text that is part of it.
            yield elem
            continue
        # Push children reversed so they are popped in document order.
        pending.extend(reversed(list(elem)))


def extract_syriac_from_tei(xml_content: str) -> list[str]:
    """
    Extract Syriac text segments from TEI XML content.

    Every element carrying xml:lang="syr" is a candidate, whatever its tag
    (<p>, <ab>, <title>, ...). Only the outermost such elements are emitted:
    an element tagged "syr" that sits inside another "syr" element is already
    part of its ancestor's text, so emitting it again would duplicate it.

    Args:
        xml_content: The TEI document as a string.

    Returns:
        One string per outermost Syriac element, in document order, with the
        text of all nested elements included, surrounding whitespace removed
        and internal whitespace runs collapsed to a single space. Elements
        with no non-whitespace text are skipped. If the document is not
        well-formed XML, the parse error is printed and an empty list is
        returned.
    """
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"XML parse error: {e}")
        return []

    syriac_texts = []
    for elem in _iter_outermost_syriac(root):
        text = "".join(elem.itertext()).strip()
        if text:
            syriac_texts.append(_WHITESPACE_RUN.sub(" ", text))
    return syriac_texts
def fetch_and_extract(file_num: int) -> list[str]:
    """
    Download one numbered TEI XML file and return its Syriac text segments.

    An empty list is returned when the file does not exist (HTTP 404, not
    reported), when the server answers with any other non-200 status
    (reported on stdout), or when the request itself fails (reported on
    stdout).
    """
    url = f"{GITHUB_RAW_BASE}/{file_num}.xml"
    try:
        reply = requests.get(url, timeout=30)
        status = reply.status_code
        if status == 200:
            return extract_syriac_from_tei(reply.text)
        # A 404 just means this number is unused in the corpus; stay quiet.
        if status != 404:
            print(f"Error fetching {url}: HTTP {status}")
    except requests.RequestException as exc:
        print(f"Request error for {url}: {exc}")
    return []
def main():
    """
    Fetch every candidate corpus file, collect the Syriac segments and save them.

    Progress and a short summary are printed to stdout. The segments are
    written one per line, UTF-8 encoded, to OUTPUT_FILE in the directory
    that contains this script.
    """
    print(f"Source: {GITHUB_RAW_BASE}")
    print()

    segments = []
    hit_count = 0
    for number in tqdm(get_corpus_file_list(), desc="Fetching TEI files"):
        extracted = fetch_and_extract(number)
        if not extracted:
            # Missing file, failed request, or a document without Syriac text.
            continue
        hit_count += 1
        segments += extracted

    print(f"\nFound {hit_count} files with Syriac text")
    print(f"Extracted {len(segments)} text segments")

    # Write the segments next to this script, one per line.
    output_path = Path(__file__).parent / OUTPUT_FILE
    with open(output_path, "w", encoding="utf-8") as out:
        for segment in segments:
            out.write(segment + "\n")
    print(f"Saved to: {output_path}")

    # Preview the first few segments, truncated to 80 characters each.
    if segments:
        print("\nSample (first 3 segments):")
        for position, segment in enumerate(segments[:3], start=1):
            print(f" {position}. {segment[:80]}...")
# Run the fetch-and-extract pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()