andrehoffmann80 committed on
Commit
aaac4b1
·
verified ·
1 Parent(s): 7176c9c

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +239 -40
src/streamlit_app.py CHANGED
@@ -1,40 +1,239 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
# The two sliders control how many points the spiral has and how often it winds.
point_count = st.slider("Number of points in spiral", 1, 10000, 1100)
turn_count = st.slider("Number of turns in spiral", 1, 300, 31)

# Position along the spiral, evenly spaced from 0 to 1; it doubles as the
# radius and as the colour index of each point.
t = np.linspace(0, 1, point_count)
angle = 2 * np.pi * turn_count * t

spiral = pd.DataFrame({
    "x": t * np.cos(angle),
    "y": t * np.sin(angle),
    "idx": t,
    "rand": np.random.randn(point_count),
})

# Axis-free scatter plot: colour follows the position, size is random.
chart = (
    alt.Chart(spiral, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(chart)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from lxml import etree
3
+
4
+ # =====================================================================
5
+ # CONFIGURATION
6
+ # =====================================================================
7
+
8
# Folder containing the MODS chapter records.
MODS_DIR = "mods_records"
# File the assembled Crossref XML is written to.
OUTPUT_XML = "crossref.xml"

# Namespace URIs
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Qualified name of the xml:lang attribute in "{uri}local" form.
XML_LANG = "{%s}lang" % XML_NS

# Prefix -> URI mapping passed as nsmap when the root element is created.
NSMAP = dict(
    jats=JATS_NS,
    xlink="http://www.w3.org/1999/xlink",
)
20
+
21
+ # =====================================================================
22
+ # TEXT CLEANING: removes only hyphenation artifacts (option 1)
23
+ # =====================================================================
24
+
25
def clean_text(text):
    """Strip soft hyphens and line breaks from *text*; change nothing else.

    Args:
        text: The raw string, or ``None``.

    Returns:
        The text with every soft hyphen (U+00AD) removed, every line break
        (``\\n``, ``\\r\\n`` or ``\\r``) replaced by a single space, and
        leading/trailing whitespace trimmed. Falsy input yields ``""``.
    """
    if not text:
        return ""
    # Fold CRLF and bare CR into LF first so that a Windows line ending
    # becomes exactly one space instead of leaving a stray carriage return.
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    # The soft hyphen is written as an escape only; an invisible literal
    # character in the source is easy to lose and was redundant anyway.
    return unified.replace("\u00AD", "").replace("\n", " ").strip()
34
+
35
+
36
+ # =====================================================================
37
+ # XML HELPERS
38
+ # =====================================================================
39
+
40
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching *xpath*.

    An empty string is returned when nothing matches or when the matched
    element has no text.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)
44
+
45
+
46
+ # =====================================================================
47
+ # PARSE A SINGLE MODS FILE INTO <content_item>
48
+ # =====================================================================
49
+
50
def mods_to_content_item(mods_path):
    """Convert one MODS chapter record into a Crossref ``<content_item>``.

    Args:
        mods_path: Path of the MODS XML file to read.

    Returns:
        A ``(content_item, page_number)`` tuple. ``content_item`` is the new
        lxml element; ``page_number`` is the chapter's first page as an int,
        or 999999 when the first page is missing or not numeric, so that
        such chapters sort last.
    """
    root = etree.parse(mods_path).getroot()

    # Bind the "mods" prefix explicitly. root.nsmap only holds the prefixes
    # the input file happens to declare, so a record that uses MODS as its
    # default namespace would make every "mods:" lookup below fail. A "mods"
    # binding declared by the file itself is still honoured.
    ns = {"mods": root.nsmap.get("mods", "http://www.loc.gov/mods/v3")}

    # --------------------------------------------------------
    # Extract metadata
    # --------------------------------------------------------
    title = get_text(root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(root, ".//mods:identifier[@type='doi']", ns)
    issued = get_text(root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(root, ".//mods:abstract", ns)

    first_page = get_text(root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(root, ".//mods:extent[@unit='page']/mods:end", ns)

    # dateIssued may carry a full date such as "2025-12-08"; only the leading
    # four-digit year belongs in <year>. Anything else is passed through.
    year = issued[:4] if issued[:4].isdigit() else issued

    # --------------------------------------------------------
    # Extract authors
    # --------------------------------------------------------
    authors = []
    for name in root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        role_text = (role.text or "").strip().lower() if role is not None else ""
        # Accept the role written out or as the MARC relator code "aut".
        if role_text in ("author", "aut"):
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            authors.append((given, family))

    # --------------------------------------------------------
    # Build <content_item>
    # --------------------------------------------------------
    ci = etree.Element("content_item", component_type="chapter")

    # Contributors: skipped entirely when there are no authors, because an
    # empty <contributors> container carries no information.
    if authors:
        contribs = etree.SubElement(ci, "contributors")
        for idx, (given, family) in enumerate(authors):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role="author",
            )
            etree.SubElement(pn, "given_name").text = given
            etree.SubElement(pn, "surname").text = family

    # Titles
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS), only when the record has one
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date, only when a year is known
    if year:
        pub = etree.SubElement(ci, "publication_date", media_type="online")
        etree.SubElement(pub, "year").text = year

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # DOI block
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi

        # The repository object id is the DOI suffix after the last "/",
        # e.g. "wsl:41891" for "10.55419/wsl:41891". Splitting on ":" would
        # drop the "wsl:" part and yield a different URL form than the
        # book-level resource.
        object_id = doi.rsplit("/", 1)[-1]
        etree.SubElement(doi_data, "resource").text = (
            f"https://www.dora.lib4ri.ch/wsl/islandora/object/{object_id}"
        )

    # Sorting helper: use first_page numeric value if available
    try:
        page_number = int(first_page)
    except ValueError:
        # Missing or non-numeric (e.g. roman) page: sort after numbered ones.
        page_number = 999999

    return ci, page_number
133
+
134
+
135
+ # =====================================================================
136
+ # MAIN: Assemble full Crossref XML
137
+ # =====================================================================
138
+
139
def assemble_crossref(mods_dir, output_path):
    """Assemble the Crossref ``<book>`` document and write it to disk.

    The fixed ``<book_series_metadata>`` block comes first. It is followed by
    one ``<content_item>`` for every ``.xml`` file in *mods_dir*, ordered by
    the page number that ``mods_to_content_item`` returns. The tree is
    serialised as pretty-printed UTF-8 with an XML declaration and written
    to *output_path*.
    """
    # Fixed book metadata (series, editors, title, date, publisher, DOI).
    series_block = etree.XML(
        """
        <book_series_metadata>
          <series_metadata>
            <titles><title>WSL Berichte</title></titles>
            <issn>22963456</issn>
          </series_metadata>

          <contributors>
            <person_name sequence="first" contributor_role="editor">
              <given_name>Alexander</given_name>
              <surname>Bast</surname>
            </person_name>
            <person_name sequence="additional" contributor_role="editor">
              <given_name>Michael</given_name>
              <surname>Bründl</surname>
            </person_name>
            <person_name sequence="additional" contributor_role="editor">
              <given_name>Marcia</given_name>
              <surname>Phillips</surname>
            </person_name>
          </contributors>

          <titles>
            <title>WSL research programme Climate Change Impacts on Alpine Mass Movements - CCAMM project report</title>
          </titles>

          <publication_date media_type="online">
            <month>12</month>
            <day>08</day>
            <year>2025</year>
          </publication_date>

          <noisbn reason="archive_volume"/>

          <publisher>
            <publisher_name>Swiss Federal Institute for Forest, Snow and Landscape Research, WSL</publisher_name>
          </publisher>

          <doi_data>
            <doi>10.55419/wsl:41891</doi>
            <resource>https://www.dora.lib4ri.ch/wsl/islandora/object/wsl:41891</resource>
          </doi_data>
        </book_series_metadata>
        """,
        parser=etree.XMLParser(remove_blank_text=True),
    )

    # Root <book> with the fixed metadata as its first child.
    root = etree.Element("book", book_type="edited_book", nsmap=NSMAP)
    root.append(series_block)

    # Convert every MODS record, remembering its sort key.
    keyed_items = []
    for entry in sorted(os.listdir(mods_dir)):
        if not entry.lower().endswith(".xml"):
            continue
        path = os.path.join(mods_dir, entry)
        print(f"Processing MODS file: {path}")
        item, sort_key = mods_to_content_item(path)
        keyed_items.append((sort_key, item))

    # Attach the chapters in first-page order; the sort is stable, so ties
    # keep their filename order.
    for _, item in sorted(keyed_items, key=lambda pair: pair[0]):
        root.append(item)

    # Serialise and write the document.
    payload = etree.tostring(
        root,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )
    with open(output_path, "wb") as handle:
        handle.write(payload)

    print("Crossref XML successfully written to:", output_path)
231
+
232
+
233
+ # =====================================================================
234
+ # RUN SCRIPT
235
+ # =====================================================================
236
+
237
# Script entry point: build the Crossref file from the configured folder.
if __name__ == "__main__":
    assemble_crossref(mods_dir=MODS_DIR, output_path=OUTPUT_XML)
    print("DONE.")