# Source: Hugging Face Space "andrehoffmann80 / DOI" (status: Sleeping).
# File size: 41,608 bytes.

import datetime
from urllib.parse import quote
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from lxml import etree
import streamlit as st

# =====================================================================
# Networking Configuration
# =====================================================================

# Request headers sent with the GET calls in this file: a desktop-browser
# User-Agent plus an Accept header that asks for XML first.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "application/xml, text/xml, */*"
}

def get_robust_session():
    """Build a ``requests`` session that retries transient HTTP failures.

    GET requests answered with status 429, 500, 502, 503 or 504 are retried
    (``total=5``, ``backoff_factor=1``). The same retrying adapter is mounted
    for both the ``http://`` and the ``https://`` scheme.

    Returns:
        The configured ``requests.Session``.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
    )
    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)
    return session

# Shared retrying session, created once at import time and reused by the
# download helpers below.
ROBUST_SESSION = get_robust_session()

# =====================================================================
# Namespaces
# =====================================================================

# Default namespace of the generated <doi_batch> document.
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"
# Namespace of the xsi:schemaLocation attribute set on <doi_batch>.
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
# Namespace of the JATS abstract elements attached to content items.
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
# Reserved XML namespace; only used to build XML_LANG below.
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Namespace of the AccessIndicators (license) program elements.
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"
# MODS namespace; bound to the "mods" prefix when a record does not declare it.
MODS_NS = "http://www.loc.gov/mods/v3"
# xml:lang attribute name in Clark notation ({namespace}local-name).
XML_LANG = f"{{{XML_NS}}}lang"


# =====================================================================
# Helper functions
# =====================================================================

def clean_text(text: str) -> str:
    """Normalize a text value before it is reused in generated output.

    Soft hyphens (U+00AD) are removed, every newline is replaced by a single
    space and surrounding whitespace is stripped; all other characters are
    left untouched.

    Args:
        text: The raw text; a falsy value (``None`` or ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` for falsy input.
    """
    if not text:
        return ""
    # The soft hyphen is spelled as an escape sequence so the invisible
    # character cannot silently get lost in an editor or during copy/paste.
    return text.replace("\u00AD", "").replace("\n", " ").strip()


def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching ``xpath``.

    Args:
        node: Element to search from (anything offering ``find``).
        xpath: Path expression handed to ``node.find``.
        ns: Prefix-to-namespace mapping used to resolve prefixes in ``xpath``.

    Returns:
        The matched element's text passed through ``clean_text``, or ``""``
        when nothing matches or the match has no text.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)


def build_dora_mods_url(repo_code: str, object_or_url: str, host: str = "www.dora.lib4ri.ch") -> list[str]:
    """Build the candidate MODS URLs for a DORA object.

    Args:
        repo_code: Repository path segment, e.g. ``"wsl"``.
        object_or_url: Either a DORA object ID such as ``"wsl:41900"`` or a
            complete ``http(s)`` URL. Surrounding whitespace is ignored.
        host: Host name used for the generated URLs.

    Returns:
        ``[url]`` when a complete URL was given; otherwise two candidates in
        the order they should be tried: the direct MODS datastream URL and
        the OAI-PMH ``GetRecord`` URL.
    """
    # Strip before the URL check so a pasted URL with stray whitespace is
    # still recognized as a URL instead of being treated as an object ID.
    target = object_or_url.strip()
    if target.startswith(("http://", "https://")):
        return [target]

    # The ID is percent-encoded for the path (":" becomes "%3A"), while the
    # OAI identifier uses "_" in place of ":".
    encoded_id = quote(target)
    oai_id = target.replace(":", "_")

    # Variant 1: direct datastream access.
    direct_url = f"https://{host}/{repo_code}/islandora/object/{encoded_id}/datastream/MODS/view"

    # Variant 2: OAI-PMH.
    oai_url = f"https://{host}/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{oai_id}&metadataPrefix=mods"

    return [direct_url, oai_url]


def build_persistent_url(repo_code: str, object_id: str) -> str:
    """Return the persistent item URL of a DORA object.

    The result has the form ``https://www.dora.lib4ri.ch/{repo_code}/item/{object_id}``
    and always points at the public ``www`` host.
    """
    item_path = f"{repo_code}/item/{object_id}"
    return f"https://www.dora.lib4ri.ch/{item_path}"


def fetch_mods_xml(mods_urls: list[str]) -> etree._Element:
    """Try each URL in turn until one of them yields a MODS element.

    Every URL is requested up to twice: first with the browser-like
    ``HEADERS``, then without extra headers in case the server rejects them.

    Args:
        mods_urls: Candidate URLs, tried in the given order.

    Returns:
        The ``<mods:mods>`` element found inside an OAI-PMH response, or the
        document root itself when its tag contains "mods".

    Raises:
        ValueError: If no attempt produced a MODS element. Before raising,
            every collected failure is written to the Streamlit page.
    """
    oai_ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": "http://www.loc.gov/mods/v3",
    }
    errors = []

    for url in mods_urls:
        # Attempt 1: with browser headers.
        # Attempt 2: without special headers (in case the server is picky).
        for use_headers in (True, False):
            current_headers = HEADERS if use_headers else {}
            header_label = "Browser-Header" if use_headers else "Standard-Header"

            try:
                resp = ROBUST_SESSION.get(url, headers=current_headers, timeout=30)
                if resp.status_code != 200:
                    errors.append(f"FAILED ({header_label}): {url} -> HTTP {resp.status_code}")
                    continue

                # recover=True tolerates malformed XML. Entity resolution is
                # switched off because the document may come from a
                # user-supplied URL and must not be able to pull in external
                # entities.
                parser = etree.XMLParser(
                    recover=True,
                    remove_blank_text=True,
                    resolve_entities=False,
                )
                root = etree.fromstring(resp.content, parser=parser)
                if root is None:
                    # A recovering parser can come back without any element.
                    errors.append(f"EMPTY ({header_label}): {url} (Antwort enthielt kein auswertbares XML)")
                    continue

                if "OAI-PMH" in root.tag:
                    # OAI-PMH envelope: pull out the embedded <mods:mods> node.
                    mods_node = root.find(".//mods:mods", namespaces=oai_ns)
                    if mods_node is not None:
                        return mods_node
                    errors.append(f"EMPTY: {url} (OAI-PMH Antwort enthielt kein <mods:mods>)")
                elif "mods" in root.tag.lower():
                    # Already the MODS element (or a MODS-like root).
                    return root
                else:
                    errors.append(f"UNRECOGNIZED: {url} (Wurzel-Element '{root.tag}' ist kein MODS)")
            except Exception as e:
                # Best effort: record the failure and move on to the next attempt.
                errors.append(f"ERROR ({header_label}): {url} -> {str(e)}")
                continue

    # Reaching this point means every attempt failed.
    st.error("### Ernte-Bericht (Alle Versuche fehlgeschlagen)")
    for err in errors:
        st.write(f"- {err}")

    raise ValueError("Metadaten konnten von keiner der verfügbaren URLs geladen werden.")


def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Args:
        book_root: Root element of the MODS record.
        repo_base_url: Repository base URL; its last path segment serves as
            the repository code when the object ID carries no ``repo:`` prefix.

    Returns:
        A dict with the keys ``book_title``, ``series_title``, ``series_issn``,
        ``publisher_name``, ``pub_year`` (int), ``pub_month``, ``pub_day``,
        ``noisbn_reason``, ``book_doi``, ``book_resource``, ``report_number``
        (always ``""`` here), ``editors`` and ``authors`` (lists of
        ``{"given": ..., "family": ...}`` dicts).
    """
    ns = book_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    # Book title
    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series (if present)
    series_title = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title",
        ns
    )
    series_issn = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']",
        ns
    )

    # Editors and authors (authors matter at book level for monographs).
    # get_text() returns "" for a missing or empty roleTerm, so such names
    # are skipped instead of raising on ``None.lower()``.
    people = {"editor": [], "author": []}
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role_text = get_text(name, "mods:role/mods:roleTerm", ns).lower()
        if role_text in people:
            people[role_text].append({
                "given": get_text(name, "mods:namePart[@type='given']", ns),
                "family": get_text(name, "mods:namePart[@type='family']", ns),
            })

    # Publisher
    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Publication year: prefer the w3cdtf key date, else any dateIssued.
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    # DOI
    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)

    # Persistent URL, e.g. https://www.dora.lib4ri.ch/psi/item/psi:84778
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id:
        # Fall back to the last DOI segment when no local identifier exists.
        book_id = book_doi.split("/")[-1] if book_doi else ""

    # Repo code comes from the ID itself ('psi' in 'psi:84778'); otherwise
    # from the base URL, ignoring a trailing slash.
    if ":" in book_id:
        current_repo = book_id.split(":")[0]
    else:
        current_repo = repo_base_url.rstrip("/").split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # ISBN / noisbn
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "" if isbn_val else "archive_volume"

    today = datetime.date.today()

    # A missing or non-numeric date falls back to the current year instead
    # of aborting the whole import with a ValueError.
    try:
        year_value = int(pub_year[:4])
    except ValueError:
        year_value = today.year

    # NOTE(review): month and day are always today's date, even when the
    # MODS date carries a month/day - confirm this is intended.
    meta = {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        "pub_year": year_value,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason,
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": people["editor"],
        "authors": people["author"],
    }
    return meta


def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
    """Convert a chapter MODS record into a Crossref ``<content_item>``.

    Optional parts (contributors, abstract, publication date, pages, license
    start date, DOI data) are only emitted when the MODS record provides the
    corresponding value, so no empty placeholder elements or attributes end
    up in the deposit.

    Args:
        mods_root: Root element of the chapter's MODS record.
        repo_base_url: Repository base URL; its last path segment serves as
            the repository code when the DOI suffix has no ``repo:`` prefix.

    Returns:
        ``(content_item, page_number)`` where ``page_number`` is the integer
        first page used for sorting, or ``999999`` when the first page is
        missing or not an integer.
    """
    ns = mods_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
    year = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(mods_root, ".//mods:abstract", ns)
    first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)

    # Only a four-digit prefix counts as a usable year; anything else is
    # treated as "no year" rather than written into the XML.
    year_prefix = year[:4]
    pub_year = year_prefix if len(year_prefix) == 4 and year_prefix.isdigit() else ""

    # Authors. The role is compared case-insensitively, matching the
    # book-level parser; a missing or empty roleTerm yields "".
    authors = []
    for name in mods_root.findall(".//mods:name[@type='personal']", ns):
        if get_text(name, "mods:role/mods:roleTerm", ns).lower() == "author":
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            authors.append((given, family))

    ci = etree.Element("content_item", component_type="chapter")

    # Contributors (skipped entirely when there are no authors)
    if authors:
        contribs = etree.SubElement(ci, "contributors")
        for idx, (given, family) in enumerate(authors):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role="author",
            )
            etree.SubElement(pn, "given_name").text = given
            etree.SubElement(pn, "surname").text = family

    # Title
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS)
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date
    if pub_year:
        pub = etree.SubElement(ci, "publication_date", media_type="online")
        etree.SubElement(pub, "year").text = pub_year

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # License information (AccessIndicators) - must come before doi_data
    ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
    license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
    license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
    license_ref.set("applies_to", "vor")
    if pub_year:
        license_ref.set("start_date", pub_year + "-01-01")

    # DOI
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi

        # The persistent URL is derived from the DOI suffix (text after the
        # last "/"), which is expected to be the object ID.
        chapter_id = doi.split("/")[-1]
        if ":" in chapter_id:
            repo_code_extracted = chapter_id.split(":")[0]
        else:
            repo_code_extracted = repo_base_url.rstrip("/").split("/")[-1]
        etree.SubElement(
            doi_data,
            "resource"
        ).text = build_persistent_url(repo_code_extracted, chapter_id)

    # Sort key: first page as integer; non-numeric pages sort last.
    try:
        page_number = int(first_page)
    except ValueError:
        page_number = 999999

    return ci, page_number


def _append_series_metadata(metadata_root, book_meta: dict) -> None:
    """Add <series_metadata> with the series title and/or ISSN that are set."""
    series_metadata = etree.SubElement(metadata_root, "series_metadata")
    if book_meta.get("series_title"):
        stitles = etree.SubElement(series_metadata, "titles")
        etree.SubElement(stitles, "title").text = book_meta["series_title"]
    if book_meta.get("series_issn"):
        etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]


def _append_person_names(metadata_root, persons: list, role: str) -> None:
    """Add <contributors> holding one <person_name> per entry of ``persons``."""
    contribs = etree.SubElement(metadata_root, "contributors")
    for idx, person in enumerate(persons):
        pn = etree.SubElement(
            contribs,
            "person_name",
            sequence="first" if idx == 0 else "additional",
            contributor_role=role,
        )
        etree.SubElement(pn, "given_name").text = person["given"]
        etree.SubElement(pn, "surname").text = person["family"]


def _append_date_part(pub, tag: str, raw_value) -> None:
    """Add a zero-padded <month>/<day> child; skip blank or non-integer values."""
    text = str(raw_value).strip() if raw_value else ""
    if not text:
        return
    try:
        number = int(text)
    except ValueError:
        # Free-text input that is not a number is deliberately left out.
        return
    etree.SubElement(pub, tag).text = f"{number:02d}"


def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """Build the Crossref ``<doi_batch>`` deposit document.

    Args:
        book_meta: Book-level metadata. ``book_title``, ``pub_year`` and
            ``publisher_name`` are required; the series, date-part, noisbn,
            DOI, resource, report-number and contributor keys are optional.
        depositor_meta: Needs ``doi_batch_id``, ``depositor_name``,
            ``depositor_email`` and ``registrant``.
        chapter_items: ``(content_item, page_number)`` pairs. The list is
            sorted in place by ``page_number`` before the items are appended.
        book_type: ``'edited_book'``, ``'monograph'`` or ``'report-paper'``
            (internal flag selecting the ``<report-paper>`` structure).

    Returns:
        The pretty-printed, UTF-8 encoded XML including the XML declaration.
    """
    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        }
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd"
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]

    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts

    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]

    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY
    body = etree.SubElement(doi_batch, "body")

    # The container element and the names of its metadata children depend on
    # the publication type; everything below is shared by both structures.
    is_report = book_type == "report-paper"
    if is_report:
        container = etree.SubElement(body, "report-paper")
        plain_tag, series_tag = "report-paper_metadata", "report-paper_series_metadata"
    else:
        container = etree.SubElement(body, "book", book_type=book_type)
        plain_tag, series_tag = "book_metadata", "book_series_metadata"

    # 1. SERIES METADATA (selects the *_series_metadata variant)
    if book_meta.get("series_title") or book_meta.get("series_issn"):
        metadata_root = etree.SubElement(container, series_tag)
        _append_series_metadata(metadata_root, book_meta)
    else:
        metadata_root = etree.SubElement(container, plain_tag)

    # 2. CONTRIBUTORS: authors for monographs and reports, editors otherwise
    if book_type in ("monograph", "report-paper"):
        contributors_list, role = book_meta.get("authors", []), "author"
    else:
        contributors_list, role = book_meta.get("editors", []), "editor"
    if contributors_list:
        _append_person_names(metadata_root, contributors_list, role)

    # 3. TITLES
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # 4. PUBLICATION DATE (children in the order month, day, year)
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    _append_date_part(pub, "month", book_meta.get("pub_month"))
    _append_date_part(pub, "day", book_meta.get("pub_day"))
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # 5. NOISBN (only for books)
    if not is_report and book_meta.get("noisbn_reason"):
        etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # 6. PUBLISHER
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # 7. PUBLISHER ITEM (report number) - only for report-paper
    if is_report and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(publisher_item, "identifier", id_type="report-number").text = book_meta["report_number"]

    # 8. DOI DATA
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # 9. COMPONENTS (chapters), appended to the container in page order
    chapter_items.sort(key=lambda x: x[1])
    for ci, _page in chapter_items:
        container.append(ci)

    return etree.tostring(
        doi_batch,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True
    )


class CrossrefSchemaResolver(etree.Resolver):
    """Resolver that downloads schemas referenced from the Crossref XSD."""

    def resolve(self, url, id, context):
        """Fetch the schema behind ``url`` and hand its content to lxml.

        Absolute ``http(s)`` URLs are fetched as given, the listed MathML 3
        file names are mapped to their w3.org location, and any other name
        is looked up below ``https://www.crossref.org/schemas/``.

        Returns:
            The result of ``resolve_string`` for the downloaded content, or
            ``None`` when the download fails for any reason.
        """
        # Known schema file names and where to download them from
        known_locations = {
            'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
            'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
            'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
            'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
        }

        if url.startswith(("http://", "https://")):
            schema_url = url
        else:
            schema_url = known_locations.get(url, f"https://www.crossref.org/schemas/{url}")

        try:
            response = ROBUST_SESSION.get(schema_url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            return self.resolve_string(response.content, context)
        except Exception:
            # Best effort: on any failure return None to use default behavior
            return None


def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """Validate Crossref XML against the Crossref 4.4.2 XSD.

    The compiled schema is downloaded on first use and then kept in
    ``st.session_state`` under the key ``crossref_schema``.

    Args:
        xml_bytes: The serialized document to check.

    Returns:
        ``(is_valid, error_messages)``. Failures to parse the document or to
        load the schema are reported as ``False`` with one explanatory
        message instead of raising.
    """
    messages = []

    try:
        document = etree.fromstring(xml_bytes)

        # Load and compile the schema only once per session
        if 'crossref_schema' not in st.session_state:
            try:
                # Included schemas are fetched through the custom resolver
                schema_parser = etree.XMLParser()
                schema_parser.resolvers.add(CrossrefSchemaResolver())

                download = ROBUST_SESSION.get(
                    "https://www.crossref.org/schemas/crossref4.4.2.xsd",
                    headers=HEADERS,
                    timeout=60,
                )
                download.raise_for_status()

                st.session_state.crossref_schema = etree.XMLSchema(
                    etree.fromstring(download.content, schema_parser)
                )
            except Exception as load_error:
                messages.append(f"Fehler beim Laden des XSD Schemas: {load_error}")
                return False, messages

        schema = st.session_state.crossref_schema

        is_valid = schema.validate(document)
        if not is_valid:
            messages.extend(
                f"Zeile {entry.line}: {entry.message}" for entry in schema.error_log
            )
        return is_valid, messages

    except etree.XMLSyntaxError as syntax_error:
        messages.append(f"XML Syntax Fehler: {syntax_error}")
        return False, messages
    except Exception as unexpected:
        messages.append(f"Unerwarteter Fehler: {unexpected}")
        return False, messages


# =====================================================================
# REPOSITORY CONFIGURATION
# =====================================================================

# Per-repository settings, keyed by repository code (the keys are offered in
# the "Repository-Code" selectbox):
#   publisher  - default publisher name for the metadata form
#   registrant - registrant stored in the session for the deposit
#   prefix     - DOI prefix shown in the UI
#   role       - stored as the session's "cr_role"; presumably the Crossref
#                login role - confirm against the upload code
REPO_CONFIG = {
    "wsl": {
        "publisher": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "registrant": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "prefix": "10.55419",
        "role": "wslx"
    },
    "psi": {
        "publisher": "Paul Scherrer Institute, PSI",
        "registrant": "Paul Scherrer Institute, PSI",
        "prefix": "10.55402",
        "role": "psit"
    },
    "empa": {
        "publisher": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "registrant": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "prefix": "10.55368",
        "role": "empa"
    },
    "eawag": {
        "publisher": "Swiss Federal Institute of Aquatic Science and Technology, Eawag", 
        "registrant": "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
        "prefix": "10.55408",
        "role": "eawa"
    }
}

def main():
    st.title("Crossref XML Generator/Uploader")

    st.markdown(
        "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
        "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
    )

    st.subheader("Konfiguration & Quelle")

    col_config, col_source = st.columns(2)
    
    with col_config:
        st.markdown("#### Verbindung & Typ")
        
        dora_host = st.selectbox(
            "DORA Host",
            options=["www.dora.lib4ri.ch", "admin.dora.lib4ri.ch"],
            index=0,
            help="Nutze 'www' für externen Zugriff (Hugging Face) und 'admin' für institutsweiten Zugriff."
        )
        
        repo_list = list(REPO_CONFIG.keys())
        repo_code = st.selectbox(
            "Repository-Code",
            options=repo_list,
            index=0, 
            format_func=lambda x: x.upper()
        )
        
        repo_config = REPO_CONFIG[repo_code]
        repo_base_url = f"https://{dora_host}/{repo_code}"
        
        pub_type = st.radio(
            "Publikationstyp",
            ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
            horizontal=False
        )
    
    # Mapping auf Crossref book_type / report type
    cr_book_type = "edited_book"
    if "Monograph" in pub_type:
        cr_book_type = "monograph"
    elif "Paper Series" in pub_type:
        cr_book_type = "report-paper"

    with col_source:
        st.markdown("#### MODS-Quelle")
        # Dynamic default ID based on repo
        default_id = "41891"
        if repo_code == "psi":
            default_id = "84057"
        
        book_id_or_url = st.text_input(
            "DORA-ID oder MODS-URL",
            value=f"{repo_code}:{default_id}",
            help="Beispiel: wsl:41900 oder komplette URL"
        )
        
        st.write("") # Spacer
        if st.button("Metadaten laden", type="primary"):
            try:
                mods_urls = build_dora_mods_url(repo_code, book_id_or_url, host=dora_host)
                st.info(f"Suche MODS-Metadaten auf: {dora_host}...")
                book_root = fetch_mods_xml(mods_urls)
                meta = parse_book_mods(book_root, repo_base_url)
                
                # --- Attempt to extract report number from MODS ---
                ns = book_root.nsmap.copy()
                if "mods" not in ns:
                    ns["mods"] = MODS_NS
                report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
                if not report_num:
                     report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
                
                if not report_num:
                     # Check <note type="report number">
                     report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
    
                if report_num:
                     meta["report_number"] = report_num
                     st.info(f"Report Number gefunden: {report_num}")
                # --------------------------------------------------
    
                # Update flat fields in session state for widgets
                for k, v in meta.items():
                    if k in ["book_title", "series_title", "series_issn", "publisher_name", 
                            "pub_year", "pub_month", "pub_day", "noisbn_reason", 
                            "book_doi", "book_resource", "report_number"]:
                        st.session_state[k] = v
                    st.session_state.book_meta[k] = v
                
                # Special handling for persons text area
                if cr_book_type in ["monograph", "report-paper"]:
                    current_list = meta.get("authors", [])
                else:
                    current_list = meta.get("editors", [])
                st.session_state["persons_input"] = "\n".join(f"{e['given']};{e['family']}" for e in current_list)

                st.session_state.book_meta_loaded = True
                st.success("Metadaten erfolgreich geladen.")
                st.rerun()
            except Exception as e:
                st.error(f"Fehler beim Laden der MODS: {e}")
                import traceback
                st.text(traceback.format_exc())

    # Session State Init Logic (unchanged but placed after UI definition for clarity in reading flow, strictly it runs before inputs generally)
    if "book_meta_loaded" not in st.session_state:
        st.session_state.book_meta_loaded = False

    # Current date for defaults
    today = datetime.date.today()

    # Initialize session state keys for widgets if not present
    if "book_title" not in st.session_state:
        st.session_state.book_title = ""
    if "series_title" not in st.session_state:
        st.session_state.series_title = ""
    if "series_issn" not in st.session_state:
        st.session_state.series_issn = ""
    if "publisher_name" not in st.session_state:
        st.session_state.publisher_name = repo_config["publisher"]
    if "pub_year" not in st.session_state:
        st.session_state.pub_year = today.year
    if "pub_month" not in st.session_state:
        st.session_state.pub_month = str(today.month)
    if "pub_day" not in st.session_state:
        st.session_state.pub_day = str(today.day)
    if "noisbn_reason" not in st.session_state:
        st.session_state.noisbn_reason = ""
    if "book_doi" not in st.session_state:
        st.session_state.book_doi = ""
    if "book_resource" not in st.session_state:
        st.session_state.book_resource = ""
    if "report_number" not in st.session_state:
        st.session_state.report_number = ""
    if "persons_input" not in st.session_state:
        st.session_state.persons_input = ""

    if "book_meta" not in st.session_state:
        st.session_state.book_meta = {
            "book_title": "",
            "series_title": "",
            "series_issn": "",
            "publisher_name": repo_config["publisher"],
            "pub_year": today.year,
            "pub_month": str(today.month),
            "pub_day": str(today.day),
            "noisbn_reason": "",
            "book_doi": "",
            "book_resource": "",
            "report_number": "", 
            "editors": [],
            "authors": [],
        }
        
    # CHECK: has the repo code changed since last run?
    if "last_repo_code" not in st.session_state:
        st.session_state.last_repo_code = repo_code
        st.session_state.registrant = repo_config["registrant"]
        st.session_state.cr_role = repo_config.get("role", "")
    
    if st.session_state.last_repo_code != repo_code:
        # Repo changed! Update defaults
        st.session_state.publisher_name = repo_config["publisher"]
        st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
        st.session_state.registrant = repo_config["registrant"]
        
        # If the user hasn't typed anything yet or if we force update?
        # Let's force update the role in session state so the input widget picks it up
        st.session_state.cr_role = repo_config.get("role", "")
        
        st.session_state.last_repo_code = repo_code

    st.markdown("---")
    st.subheader("Metadaten & Inhalte")
    
    # Use expander for metadata editing to keep UI clean
    with st.expander("Metadaten bearbeiten", expanded=True):
        bm = st.session_state.book_meta
    
        col_b1, col_b2 = st.columns(2)
        with col_b1:
            st.text_input("Titel", key="book_title")
            st.text_input("Serientitel", key="series_title")
            st.text_input("Serien-ISSN", key="series_issn")
            st.text_input("Publisher Name", key="publisher_name")
            
            if cr_book_type == "report-paper":
                st.text_input("Report Number", key="report_number")
    
        with col_b2:
            c_y, c_m, c_d = st.columns(3)
            with c_y:
                st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
            with c_m:
                st.text_input("Monat", key="pub_month")
            with c_d:
                st.text_input("Tag", key="pub_day")
            
            if cr_book_type != "report-paper":
                 st.text_input("noisbn reason", key="noisbn_reason")
    
        st.markdown("##### Identifikatoren")
        col_id1, col_id2 = st.columns(2)
        with col_id1:
             st.text_input("DOI", key="book_doi")
        with col_id2:
             st.text_input("Resource URL", key="book_resource")
             
        st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")
    
        st.markdown("##### Mitwirkende")
        # Decide label based on type
        if cr_book_type in ["monograph", "report-paper"]:
            st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
            label = "Autoren"
        else:
            st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
            label = "Editoren"
    
        persons_text = st.text_area(label, key="persons_input", height=100)
    
        # Parse and save back
        new_persons = []
        for line in persons_text.splitlines():
            line = line.strip()
            if not line:
                continue
            parts = [p.strip() for p in line.split(";")]
            if len(parts) == 2:
                new_persons.append({"given": parts[0], "family": parts[1]})
        
        if cr_book_type in ["monograph", "report-paper"]:
            bm["authors"] = new_persons
        else:
            bm["editors"] = new_persons

    st.markdown("---")
    st.subheader("Depositor & Batch Info")
    
    with st.expander("Depositor Details", expanded=False):
        col_d1, col_d2 = st.columns(2)
        with col_d1:
            depositor_name = st.text_input(
                "Depositor Name",
                value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL"
            )
        with col_d2:
            depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")
        
        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        
        batch_prefix = "book"
        if cr_book_type == "report-paper":
            batch_prefix = "report"
        elif cr_book_type == "monograph":
            batch_prefix = "monograph"
            
        doi_batch_id = st.text_input(
            "DOI Batch ID",
            value=f"{batch_prefix}_{ts}",
            help="Wird im XML-Header verwendet. Sollte eindeutig sein."
        )
            
        if "registrant" not in st.session_state:
            st.session_state.registrant = repo_config["registrant"]
        
        registrant = st.text_input("Registrant", value=st.session_state.registrant)
        st.session_state.registrant = registrant
    
        depositor_meta = {
            "depositor_name": depositor_name,
            "depositor_email": depositor_email,
            "registrant": st.session_state.registrant,
            "doi_batch_id": doi_batch_id
        }

    st.subheader("Kapitel / Inhalte")
    st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")

    st.markdown(
        "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
        "pro Zeile ein."
    )

    chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")

    st.markdown("---")
    st.subheader("XML Generierung")

    if st.button("Crossref XML generieren", type="primary"):
        try:
            chapter_items = []

            for line in chapters_text.splitlines():
                line = line.strip()
                if not line:
                    continue
                mods_urls = build_dora_mods_url(repo_code, line, host=dora_host)
                mods_root = fetch_mods_xml(mods_urls)
                ci, page_no = mods_to_content_item(mods_root, repo_base_url)
                chapter_items.append((ci, page_no))

            if not chapter_items and cr_book_type == "edited_book":
                st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")
            
            # book_meta aus session state / widgets zusammenbauen
            book_meta = {
                "book_title": st.session_state.book_title,
                "series_title": st.session_state.series_title,
                "series_issn": st.session_state.series_issn,
                "publisher_name": st.session_state.publisher_name,
                "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
                "pub_month": st.session_state.pub_month,
                "pub_day": st.session_state.pub_day,
                "noisbn_reason": st.session_state.get("noisbn_reason", ""),
                "book_doi": st.session_state.book_doi,
                "book_resource": st.session_state.book_resource,
                "report_number": st.session_state.get("report_number", ""),
                "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
                "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
            }

            xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)
            
            # Store in session state
            st.session_state.crossref_xml = xml_bytes
            st.session_state.crossref_filename = "crossref_edited_book.xml"
            
            st.success("Crossref XML erfolgreich erzeugt!")

            # Validierung gegen Crossref XSD Schema
            st.subheader("XML Validierung")
            with st.spinner("Validiere XML gegen Crossref Schema..."):
                is_valid, validation_errors = validate_crossref_xml(xml_bytes)

            if is_valid:
                st.success("✓ XML ist valide und bereit für Crossref!")
            else:
                st.error("✗ XML Validierung fehlgeschlagen:")
                for error in validation_errors:
                    st.error(f"  • {error}")
                st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")

        except Exception as e:
            st.error(f"Fehler bei der Erzeugung des XML: {e}")
            import traceback
            st.text(traceback.format_exc())

    # Display Download and Upload if XML exists in session state
    if "crossref_xml" in st.session_state:
        xml_bytes = st.session_state.crossref_xml
        
        # Download Button
        st.download_button(
            label="XML herunterladen",
            data=xml_bytes,
            file_name=st.session_state.crossref_filename,
            mime="application/xml"
        )

        # ---------------------------------------------------------
        # Crossref Upload Section
        # ---------------------------------------------------------
        st.markdown("---")
        st.subheader("Automatischer Upload zu Crossref")
        
        # Determine default role if not in session state
        if "cr_role" not in st.session_state:
             st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")

        col_u1, col_u2 = st.columns(2)
        with col_u1:
            cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
            # Use key to bind to session state
            cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
        with col_u2:
            cr_pass = st.text_input("Crossref Password", type="password")
        
        if st.button("Upload to Crossref"):
            if not cr_user or not cr_pass:
                st.error("Bitte Username und Passwort für Crossref angeben.")
            else:
                with st.spinner("Lade zu Crossref hoch..."):
                    res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)
                    
                    if isinstance(res, str) and res.startswith("Exception"):
                            st.error(f"Upload fehlgeschlagen: {res}")
                    else:
                            # Crossref returns 200 even on some logic errors, text contains details
                            if res.status_code == 200:
                                if "successfully received" in res.text:
                                    st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
                                    with st.expander("Server-Antwort ansehen"):
                                        st.text(res.text)
                                else:
                                    st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
                                    with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
                                        st.text(res.text)
                            else:
                                st.error(f"HTTP Fehler: {res.status_code}")
                                st.text(res.text)


def upload_to_crossref(
    xml_content,
    username,
    password,
    role=None,
    *,
    url="https://doi.crossref.org/servlet/deposit",
    timeout=60,
):
    """Upload a deposit XML document to Crossref via a multipart POST.

    Args:
        xml_content: The XML payload, sent as the ``fname`` file part.
        username: Crossref account name.
        password: Crossref account password.
        role: Optional Crossref role. When it is non-blank, the login id is
            sent as ``"<username>/<role>"`` (role stripped of whitespace);
            otherwise the plain username is used.
        url: Deposit endpoint to POST to. Defaults to the production
            deposit servlet; override to target a different endpoint.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The ``requests.Response`` when the HTTP exchange completed (the
        status code is NOT checked here; callers must inspect it), or a
        string starting with ``"Exception: "`` when the request raised.
    """
    # Crossref expects "username/role" as login id when a role is used.
    cleaned_role = role.strip() if role else ""
    login_id = f"{username}/{cleaned_role}" if cleaned_role else username

    # Multipart form: the XML travels as the 'fname' file part, credentials
    # and the operation name as ordinary form fields.
    files = {
        'fname': ('crossref_submission.xml', xml_content, 'application/xml'),
    }
    data = {
        'operation': 'doMDUpload',
        'login_id': login_id,
        'login_passwd': password,
    }

    try:
        return requests.post(url, files=files, data=data, timeout=timeout)
    except Exception as e:
        # Deliberately broad: callers detect failure by checking for a string
        # result that starts with "Exception", so errors are reported as a
        # value instead of being raised.
        return f"Exception: {e}"

# Entry point: call main() only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()