Spaces:

andrehoffmann80
/

DOI

Sleeping

App Files Files Community

andrehoffmann80 committed on Apr 1

Commit

aaac4b1

verified ·

1 Parent(s): 7176c9c

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +239 -40

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,239 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+from lxml import etree
+# =====================================================================
+# CONFIGURATION
+# =====================================================================
# Directory that holds the MODS chapter records.
MODS_DIR = "mods_records"
# Path of the XML file written by the script.
OUTPUT_XML = "crossref.xml"
# Namespace URIs used while building the output tree.
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Clark-notation ("{uri}local") name of the xml:lang attribute.
XML_LANG = "{" + XML_NS + "}lang"
# Prefix -> URI declarations placed on the root element.
NSMAP = dict(
    jats=JATS_NS,
    xlink="http://www.w3.org/1999/xlink",
)
+# =====================================================================
+# TEXT CLEANING: Entfernt nur Silbentrennungsartefakte (Option 1)
+# =====================================================================
def clean_text(text):
    """Remove hyphenation artefacts from *text* and leave everything else alone.

    Soft hyphens (U+00AD) are deleted, line feeds become single spaces and
    leading/trailing whitespace is trimmed. No other character is changed.

    Args:
        text: Raw string taken from an XML node, or ``None``.

    Returns:
        The cleaned string; ``""`` when *text* is ``None`` or empty.
    """
    if not text:
        return ""
    # Code points are spelled out numerically on purpose: an invisible literal
    # soft hyphen in the source is easily lost by editors or copy/paste, which
    # silently degrades the call to a no-op ``replace("", "")``.
    return text.translate({0x00AD: None, 0x000A: " "}).strip()
+# =====================================================================
+# XML HELPERS
+# =====================================================================
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching *xpath*.

    Args:
        node: Element to search from.
        xpath: Path expression handed to ``node.find``.
        ns: Prefix -> URI mapping used to resolve prefixes in *xpath*.

    Returns:
        The text of the first match passed through ``clean_text``, or ``""``
        when nothing matches or the match has no text.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)
+# =====================================================================
+# PARSE A SINGLE MODS FILE INTO <content_item>
+# =====================================================================
def mods_to_content_item(mods_path):
    """Convert one MODS chapter record into a Crossref ``<content_item>``.

    Args:
        mods_path: Path of the MODS XML file to read.

    Returns:
        A ``(content_item, page_number)`` tuple. ``content_item`` is the newly
        built element; ``page_number`` is the first page as an ``int``, or
        ``999999`` when the first page is missing or not a plain number, so
        that such chapters sort after all numbered ones.

    Raises:
        Errors raised by ``etree.parse`` (unreadable or malformed file)
        propagate unchanged.
    """
    root = etree.parse(mods_path).getroot()

    # Always provide a ``mods`` prefix so records that declare MODS as their
    # default namespace still resolve; prefixes declared on the root element
    # take precedence. The unprefixed (None) entry is dropped because it is
    # not needed by the prefixed lookups below.
    ns = {"mods": "http://www.loc.gov/mods/v3"}
    ns.update({prefix: uri for prefix, uri in root.nsmap.items() if prefix})

    # --------------------------------------------------------
    # Extract metadata
    # --------------------------------------------------------
    title = get_text(root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(root, ".//mods:identifier[@type='doi']", ns)
    year = get_text(root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(root, ".//mods:abstract", ns)
    first_page = get_text(root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(root, ".//mods:extent[@unit='page']/mods:end", ns)

    # --------------------------------------------------------
    # Extract authors: personal names whose role term is "author"
    # --------------------------------------------------------
    authors = []
    for name in root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is None or (role.text or "").strip() != "author":
            continue
        authors.append((
            get_text(name, "mods:namePart[@type='given']", ns),
            get_text(name, "mods:namePart[@type='family']", ns),
        ))

    # --------------------------------------------------------
    # Build <content_item>
    # --------------------------------------------------------
    ci = etree.Element("content_item", component_type="chapter")

    # Contributors: skipped entirely when the record lists no author, so no
    # empty <contributors/> is written.
    # NOTE(review): an empty <contributors/> is presumably rejected by the
    # Crossref schema - confirm against the schema version in use.
    if authors:
        contribs = etree.SubElement(ci, "contributors")
        for idx, (given, family) in enumerate(authors):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role="author",
            )
            etree.SubElement(pn, "given_name").text = given
            etree.SubElement(pn, "surname").text = family

    # Titles
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS), only when the record actually has one.
    if abstract:
        jats_abs = etree.SubElement(
            ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"}
        )
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date
    pub = etree.SubElement(ci, "publication_date", media_type="online")
    etree.SubElement(pub, "year").text = year

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # DOI block
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi
        # The repository object id is the part after the last "/", e.g.
        # "10.55419/wsl:41891" -> "wsl:41891". Splitting on ":" would drop the
        # "wsl:" namespace and produce a URL that does not match the pattern
        # used for the book-level resource.
        object_id = doi.rpartition("/")[2]
        etree.SubElement(
            doi_data,
            "resource"
        ).text = f"https://www.dora.lib4ri.ch/wsl/islandora/object/{object_id}"

    # Sorting helper: numeric first page, or a large sentinel when the value
    # is empty or not an integer.
    try:
        page_number = int(first_page)
    except ValueError:
        page_number = 999999
    return ci, page_number
+# =====================================================================
+# MAIN: Assemble full Crossref XML
+# =====================================================================
def assemble_crossref(mods_dir, output_path):
    """Build the ``<book>`` document from all MODS records and write it out.

    Every ``*.xml`` file in *mods_dir* (extension matched case-insensitively)
    is converted with ``mods_to_content_item``. The resulting chapters are
    ordered by their page number and appended after the fixed book metadata.
    The tree is then serialised as pretty-printed UTF-8 with an XML
    declaration.

    Args:
        mods_dir: Directory containing the MODS chapter files.
        output_path: File the serialised XML is written to (binary mode).
    """
    # ----------------------------------------------------------------
    # Fixed book metadata (series, editors, title, date, publisher, DOI)
    # ----------------------------------------------------------------
    fixed_metadata_xml = """
      <book_series_metadata>
        <series_metadata>
          <titles><title>WSL Berichte</title></titles>
          <issn>22963456</issn>
        </series_metadata>
        <contributors>
          <person_name sequence="first" contributor_role="editor">
            <given_name>Alexander</given_name>
            <surname>Bast</surname>
          </person_name>
          <person_name sequence="additional" contributor_role="editor">
            <given_name>Michael</given_name>
            <surname>Bründl</surname>
          </person_name>
          <person_name sequence="additional" contributor_role="editor">
            <given_name>Marcia</given_name>
            <surname>Phillips</surname>
          </person_name>
        </contributors>
        <titles>
          <title>WSL research programme Climate Change Impacts on Alpine Mass Movements - CCAMM project report</title>
        </titles>
        <publication_date media_type="online">
          <month>12</month>
          <day>08</day>
          <year>2025</year>
        </publication_date>
        <noisbn reason="archive_volume"/>
        <publisher>
          <publisher_name>Swiss Federal Institute for Forest, Snow and Landscape Research, WSL</publisher_name>
        </publisher>
        <doi_data>
          <doi>10.55419/wsl:41891</doi>
          <resource>https://www.dora.lib4ri.ch/wsl/islandora/object/wsl:41891</resource>
        </doi_data>
      </book_series_metadata>
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)

    # Root <book> element carrying the shared namespace declarations.
    book = etree.Element("book", book_type="edited_book", nsmap=NSMAP)
    book.append(etree.XML(fixed_metadata_xml, parser=blank_stripping_parser))

    # ----------------------------------------------------------------
    # Convert every MODS record found in the directory
    # ----------------------------------------------------------------
    converted = []
    for entry in sorted(os.listdir(mods_dir)):
        if not entry.lower().endswith(".xml"):
            continue
        record_path = os.path.join(mods_dir, entry)
        print(f"Processing MODS file: {record_path}")
        item, first_page_no = mods_to_content_item(record_path)
        converted.append((first_page_no, item))

    # Stable sort on the page number only: chapters with equal numbers keep
    # their file-name order.
    for _, item in sorted(converted, key=lambda pair: pair[0]):
        book.append(item)

    # ----------------------------------------------------------------
    # Serialise and write the result
    # ----------------------------------------------------------------
    serialised = etree.tostring(
        book,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )
    with open(output_path, "wb") as handle:
        handle.write(serialised)
    print("Crossref XML successfully written to:", output_path)
+# =====================================================================
+# RUN SCRIPT
+# =====================================================================
if __name__ == "__main__":
    # Script entry point: build the XML from the configured input directory
    # and output path, then report completion.
    assemble_crossref(mods_dir=MODS_DIR, output_path=OUTPUT_XML)
    print("DONE.")