#!/usr/bin/env python3
"""
Fetch and extract Syriac text from the Digital Syriac Corpus TEI XML files.

Source: https://github.com/srophe/syriac-corpus
"""

import os
import re
from pathlib import Path
from typing import Optional
from xml.etree import ElementTree as ET

import requests
from tqdm import tqdm

# TEI namespace
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# Name under which ElementTree exposes the ``xml:lang`` attribute
# (namespace URI in braces followed by the local name).
XML_LANG_ATTR = "{http://www.w3.org/XML/1998/namespace}lang"

# GitHub raw content base URL
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/srophe/syriac-corpus/main/data/tei"

# Output file
OUTPUT_FILE = "syriac_corpus_tei.txt"

# Highest file number that is probed; numbers without a file are skipped.
MAX_FILE_NUMBER = 692


def get_corpus_file_list() -> list[int]:
    """
    Get list of candidate corpus file numbers.

    The corpus has numbered files (1.xml, 2.xml, etc.). The numbering is not
    known in advance, so every number from 1 to ``MAX_FILE_NUMBER`` is
    returned and the caller is expected to skip the ones that do not exist.
    """
    return list(range(1, MAX_FILE_NUMBER + 1))


def extract_syriac_from_tei(xml_content: str) -> list[str]:
    """
    Extract Syriac text from TEI XML content.

    Every element carrying ``xml:lang="syr"`` contributes one segment made of
    all text inside it (nested elements included) with runs of whitespace
    collapsed to a single space. Elements nested inside an already matched
    element are not visited again, so their text is not emitted twice.

    Args:
        xml_content: The TEI document as a string.

    Returns:
        The non-empty text segments in document order. An empty list is
        returned (and the error printed) when the XML cannot be parsed.
    """
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"XML parse error: {e}")
        return []

    syriac_texts = []
    # Explicit stack instead of root.iter(): it lets us stop descending at a
    # match. Children are pushed reversed so they pop in document order.
    pending = [root]
    while pending:
        elem = pending.pop()
        if elem.get(XML_LANG_ATTR) == "syr":
            text = "".join(elem.itertext()).strip()
            if text:
                syriac_texts.append(re.sub(r"\s+", " ", text))
            # itertext() already covered all descendants of this element.
            continue
        pending.extend(reversed(list(elem)))

    return syriac_texts


def fetch_and_extract(
    file_num: int, session: Optional[requests.Session] = None
) -> list[str]:
    """
    Fetch a single TEI XML file and extract Syriac text.

    Args:
        file_num: Number of the corpus file; ``<file_num>.xml`` is requested
            below ``GITHUB_RAW_BASE``.
        session: Optional ``requests.Session`` to send the request through so
            that connections are reused across calls. When omitted, a
            one-off ``requests.get`` is made.

    Returns:
        The extracted text segments, or an empty list when the file does not
        exist (HTTP 404, silent), another HTTP status is returned, or the
        request fails (both reported on stdout).
    """
    url = f"{GITHUB_RAW_BASE}/{file_num}.xml"
    # The requests module and a Session expose the same ``get`` call.
    http = session if session is not None else requests

    try:
        response = http.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Request error for {url}: {e}")
        return []

    if response.status_code == 200:
        return extract_syriac_from_tei(response.text)
    if response.status_code != 404:
        # 404 just means this number has no file; anything else is reported.
        print(f"Error fetching {url}: HTTP {response.status_code}")
    return []


def main() -> None:
    """
    Download every candidate corpus file and write the Syriac text to disk.

    All extracted segments are written, one per line, to ``OUTPUT_FILE`` in
    the directory containing this script; progress and a short sample are
    printed to stdout.
    """
    print(f"Source: {GITHUB_RAW_BASE}")
    print()

    all_texts = []
    files_found = 0

    # One session for the whole run so the HTTPS connection is reused
    # instead of being re-established for each of the candidate files.
    with requests.Session() as session:
        for file_num in tqdm(get_corpus_file_list(), desc="Fetching TEI files"):
            texts = fetch_and_extract(file_num, session=session)
            if texts:
                files_found += 1
                all_texts.extend(texts)

    print(f"\nFound {files_found} files with Syriac text")
    print(f"Extracted {len(all_texts)} text segments")

    # Write to output file
    output_path = Path(__file__).parent / OUTPUT_FILE
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in all_texts)

    print(f"Saved to: {output_path}")

    # Show sample
    if all_texts:
        print("\nSample (first 3 segments):")
        for i, text in enumerate(all_texts[:3], start=1):
            print(f" {i}. {text[:80]}...")


if __name__ == "__main__":
    main()