#!/usr/bin/env python3
"""
Fetch and extract Syriac text from the Digital Syriac Corpus TEI XML files.

Source: https://github.com/srophe/syriac-corpus
"""

import os
import re
from pathlib import Path
from xml.etree import ElementTree as ET

import requests
from tqdm import tqdm

# TEI namespace as a prefix -> URI mapping (the shape ElementTree accepts for
# its ``namespaces`` argument). Not referenced elsewhere in the visible code.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# Base URL for raw file content on GitHub; fetch_and_extract() appends
# "/<number>.xml" to it to build each request URL.
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/srophe/syriac-corpus/main/data/tei"

# Name of the output text file; main() creates it in this script's directory.
OUTPUT_FILE = "syriac_corpus_tei.txt"


def get_corpus_file_list() -> list[int]:
    """
    Return the candidate corpus file numbers to try downloading.

    Corpus files are addressed by number (1.xml, 2.xml, ...). Nothing is
    queried here: the result is simply every integer from 1 through 692,
    and numbers with no matching file are expected to be skipped at fetch
    time.
    """
    first_candidate = 1
    last_candidate = 692  # inclusive upper bound of the numbers to probe
    return [*range(first_candidate, last_candidate + 1)]


# Clark-notation name under which ElementTree exposes the xml:lang attribute.
_XML_LANG_ATTR = "{http://www.w3.org/XML/1998/namespace}lang"


def _is_syriac(lang: str) -> bool:
    """Return True if *lang* is ``syr`` or a subtag of it (e.g. ``syr-Syrj``)."""
    return lang.lower().split("-", 1)[0] == "syr"


def _own_language_text(elem) -> str:
    """Return *elem*'s text, omitting subtrees that declare their own xml:lang."""
    parts = [elem.text or ""]
    for child in elem:
        # Descend only into real elements that inherit the language; a child
        # with its own xml:lang is either another language (excluded) or a
        # Syriac element that is emitted as a segment of its own.
        if isinstance(child.tag, str) and child.get(_XML_LANG_ATTR) is None:
            parts.append(_own_language_text(child))
        # The tail follows the child's end tag, so it belongs to *elem*.
        parts.append(child.tail or "")
    return "".join(parts)


def extract_syriac_from_tei(xml_content: str) -> list[str]:
    """
    Extract Syriac text segments from TEI XML content.

    Every element whose ``xml:lang`` is ``syr`` (or a ``syr-*`` subtag)
    yields one segment, in document order. A segment holds the text of that
    element and of descendants that inherit its language; descendants that
    carry their own ``xml:lang`` are left out. As a result each piece of
    text is emitted at most once (nested Syriac elements are not duplicated
    inside their ancestors) and embedded material in other languages is
    excluded.

    Args:
        xml_content: A complete XML document.

    Returns:
        Non-empty segments with surrounding whitespace stripped and internal
        whitespace runs collapsed to a single space. If the document cannot
        be parsed, the error is printed and an empty list is returned.
    """
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        # Best effort: one malformed file must not abort the whole run.
        print(f"XML parse error: {e}")
        return []

    syriac_texts = []
    for elem in root.iter():
        if not _is_syriac(elem.get(_XML_LANG_ATTR, "")):
            continue
        text = _own_language_text(elem).strip()
        if text:
            syriac_texts.append(re.sub(r"\s+", " ", text))
    return syriac_texts


def fetch_and_extract(file_num: int) -> list[str]:
    """
    Download one numbered TEI XML file and return its Syriac segments.

    Returns an empty list when the file does not exist (HTTP 404, silently),
    when the server answers with any other non-200 status, or when the
    request itself fails; the latter two cases are reported on stdout.
    """
    url = f"{GITHUB_RAW_BASE}/{file_num}.xml"

    try:
        reply = requests.get(url, timeout=30)
        status = reply.status_code

        if status == 404:
            # Gaps in the numbering are expected; nothing to report.
            return []

        if status != 200:
            print(f"Error fetching {url}: HTTP {status}")
            return []

        return extract_syriac_from_tei(reply.text)
    except requests.RequestException as exc:
        print(f"Request error for {url}: {exc}")
        return []


def main():
    """
    Download every candidate corpus file and save the extracted segments.

    Segments from all files are gathered in fetch order, written one per
    line (UTF-8) to OUTPUT_FILE in this script's directory, and a short
    summary plus up to three sample segments are printed.
    """
    print(f"Source: {GITHUB_RAW_BASE}")
    print()

    segments = []
    productive_files = 0  # files that yielded at least one segment

    for number in tqdm(get_corpus_file_list(), desc="Fetching TEI files"):
        extracted = fetch_and_extract(number)
        if not extracted:
            continue
        productive_files += 1
        segments += extracted

    print(f"\nFound {productive_files} files with Syriac text")
    print(f"Extracted {len(segments)} text segments")

    # Write to output file, one segment per line
    destination = Path(__file__).parent / OUTPUT_FILE
    with open(destination, "w", encoding="utf-8") as out:
        out.writelines(f"{segment}\n" for segment in segments)

    print(f"Saved to: {destination}")

    # Show sample
    if segments:
        print("\nSample (first 3 segments):")
        for position, segment in enumerate(segments[:3], start=1):
            print(f"  {position}. {segment[:80]}...")


# Run the download/extract pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()