| |
| """ |
| Fetch and extract Syriac text from the Digital Syriac Corpus TEI XML files. |
| |
| Source: https://github.com/srophe/syriac-corpus |
| """ |
|
|
| import os |
| import re |
| from pathlib import Path |
| from xml.etree import ElementTree as ET |
|
|
| import requests |
| from tqdm import tqdm |
|
|
| |
# Prefix-to-URI mapping for the TEI XML namespace.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}


# Base URL of the raw TEI files; fetch_and_extract() appends "/<number>.xml".
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/srophe/syriac-corpus/main/data/tei"


# Name of the output text file; main() writes it next to this script.
OUTPUT_FILE = "syriac_corpus_tei.txt"
|
|
|
|
def get_corpus_file_list() -> list[int]:
    """
    Return the candidate corpus file numbers.

    Corpus files are named by number (1.xml, 2.xml, ...). The range is
    hard-coded rather than discovered, so it covers 1 through 692 inclusive.
    """
    first_number = 1
    last_number = 692
    return list(range(first_number, last_number + 1))
|
|
|
|
def extract_syriac_from_tei(xml_content: str) -> list[str]:
    """
    Extract Syriac text from TEI XML content.

    Every element whose ``xml:lang`` attribute is exactly ``"syr"`` (for
    example ``<p xml:lang="syr">`` or ``<ab xml:lang="syr">``) contributes
    one segment: the concatenated text of the element and all of its
    descendants, stripped, with each whitespace run collapsed to one space.

    Elements nested inside an already-matched Syriac element are not
    emitted again, because their text is already part of the ancestor's
    segment. Emitting them separately would duplicate text in the output.

    Args:
        xml_content: The XML document as a string.

    Returns:
        The non-empty segments in document order. If the XML cannot be
        parsed, the error is printed and an empty list is returned.
    """
    # ElementTree exposes xml:lang under the fixed XML namespace URI.
    xml_lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"
    syriac_texts: list[str] = []

    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"XML parse error: {e}")
        return syriac_texts

    # Explicit stack instead of recursion; children are pushed in reverse
    # so that elements are popped (visited) in document order.
    pending = [root]
    while pending:
        elem = pending.pop()
        if elem.get(xml_lang_attr) == "syr":
            text = "".join(elem.itertext()).strip()
            if text:
                syriac_texts.append(re.sub(r"\s+", " ", text))
            # Do not descend: itertext() above already covered descendants.
            continue
        pending.extend(reversed(list(elem)))

    return syriac_texts
|
|
|
|
def fetch_and_extract(file_num: int) -> list[str]:
    """
    Download one numbered TEI XML file and return its Syriac segments.

    A 200 response is passed to extract_syriac_from_tei(). A 404 yields an
    empty list silently. Any other status, or a requests-level failure, is
    printed and also yields an empty list.
    """
    url = f"{GITHUB_RAW_BASE}/{file_num}.xml"

    try:
        response = requests.get(url, timeout=30)
        status = response.status_code
        if status == 200:
            return extract_syriac_from_tei(response.text)
        # A missing file number is expected and stays quiet; anything else
        # is reported.
        if status != 404:
            print(f"Error fetching {url}: HTTP {status}")
    except requests.RequestException as exc:
        print(f"Request error for {url}: {exc}")

    return []
|
|
|
|
def main():
    """
    Fetch every candidate corpus file, collect the Syriac segments, write
    them one per line to OUTPUT_FILE next to this script, and print a
    short summary with up to three sample segments.
    """
    print(f"Source: {GITHUB_RAW_BASE}")
    print()

    segments = []
    hit_count = 0

    for number in tqdm(get_corpus_file_list(), desc="Fetching TEI files"):
        extracted = fetch_and_extract(number)
        if not extracted:
            continue
        hit_count += 1
        segments += extracted

    print(f"\nFound {hit_count} files with Syriac text")
    print(f"Extracted {len(segments)} text segments")

    # The output file lives in the same directory as this script.
    destination = Path(__file__).parent / OUTPUT_FILE
    with destination.open("w", encoding="utf-8") as out:
        out.writelines(f"{segment}\n" for segment in segments)

    print(f"Saved to: {destination}")

    # Show a short preview so the result can be checked by eye.
    if segments:
        print("\nSample (first 3 segments):")
        for position, segment in enumerate(segments[:3], start=1):
            print(f" {position}. {segment[:80]}...")
|
|
|
|
# Run the fetch-and-extract pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|
|