andrehoffmann80 committed on
Commit
cb6e177
·
verified ·
1 Parent(s): db474d9

Delete dashboard.py

Browse files
Files changed (1) hide show
  1. dashboard.py +0 -976
dashboard.py DELETED
@@ -1,976 +0,0 @@
1
- import datetime
2
- from urllib.parse import quote
3
-
4
- import requests
5
- from lxml import etree
6
- import streamlit as st
7
-
8
- # =====================================================================
9
- # Namespaces
10
- # =====================================================================
11
-
12
# Namespace URIs for the Crossref deposit that is written ...
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"
# ... and for the MODS records that are read.
MODS_NS = "http://www.loc.gov/mods/v3"
# Namespace-qualified ("{uri}local") name of the xml:lang attribute.
XML_LANG = "{" + XML_NS + "}lang"
19
-
20
-
21
- # =====================================================================
22
- # Hilfsfunktionen
23
- # =====================================================================
24
-
25
def clean_text(text: str) -> str:
    """Remove soft hyphens and line breaks from a text value.

    Soft hyphens (U+00AD) are dropped, each newline becomes a single space
    and leading/trailing whitespace is trimmed; every other character is
    left untouched. A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    # Single pass over the string: delete U+00AD, map "\n" to " ".
    substitutions = {0x00AD: None, 0x000A: " "}
    return text.translate(substitutions).strip()
34
-
35
-
36
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching *xpath*.

    *xpath* is looked up below *node* with the prefix map *ns*. When nothing
    matches, or the matched element has no text, an empty string is
    returned; otherwise the text is passed through ``clean_text``.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)
39
-
40
-
41
def build_dora_mods_url(base_url: str, repo_code: str, object_or_url: str) -> str:
    """Build the URL from which the MODS record of a DORA object is fetched.

    A value that already is an http(s) URL is returned unchanged. Otherwise
    it is treated as a DORA ID such as ``wsl:41900`` and turned into an
    OAI-PMH ``GetRecord`` request (``metadataPrefix=mods``) against the
    ``oai2`` endpoint of *repo_code*.

    Per the original author's note, the public OAI-PMH interface is used to
    avoid IP blocks when running on HuggingFace, which is why an
    ``admin.dora`` host in *base_url* is rewritten to ``www.dora``.
    """
    if object_or_url.startswith(("http://", "https://")):
        return object_or_url

    # "admin" host -> public "www" host; drop any trailing slash.
    public_base = base_url.replace("admin.dora", "www.dora").rstrip("/")

    # The OAI identifier uses "_" where the DORA ID has ":" (wsl:41891 -> wsl_41891).
    identifier = "oai:dora:" + object_or_url.replace(":", "_")

    return (
        f"{public_base}/{repo_code}/oai2"
        f"?verb=GetRecord&identifier={identifier}&metadataPrefix=mods"
    )
57
-
58
-
59
def build_persistent_url(repo_code: str, object_id: str) -> str:
    """Return the persistent public URL of a DORA object.

    The result has the form ``https://www.dora.lib4ri.ch/{repo}/item/{id}``;
    the public ``www`` host is always used for persistent links.
    """
    return "https://www.dora.lib4ri.ch/{}/item/{}".format(repo_code, object_id)
67
-
68
-
69
def fetch_mods_xml(mods_url: str) -> etree._Element:
    """Download a MODS record and return its MODS root element.

    The URL may point at a plain MODS file or at an OAI-PMH response; in the
    latter case the first ``<mods:mods>`` element inside the response is
    returned instead of the OAI-PMH envelope.

    Args:
        mods_url: URL of the MODS document or OAI-PMH ``GetRecord`` request.

    Returns:
        The root element of a plain MODS document, or the embedded
        ``<mods:mods>`` element of an OAI-PMH response.

    Raises:
        requests.RequestException: on network errors, timeouts or an HTTP
            error status.
        ValueError: if the response cannot be parsed as XML at all, or if an
            OAI-PMH response contains no MODS element.
    """
    # A timeout keeps the dashboard from hanging forever on a stalled server.
    resp = requests.get(mods_url, timeout=30)
    resp.raise_for_status()

    # recover=True tolerates malformed XML (e.g. unescaped HTML in notes).
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    root = etree.fromstring(resp.content, parser=parser)
    if root is None:
        # In recover mode the parser can give up without raising.
        raise ValueError(f"Antwort ist kein auswertbares XML: {mods_url}")

    if "OAI-PMH" not in root.tag:
        return root

    ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": MODS_NS,
    }
    mods_node = root.find(".//mods:mods", namespaces=ns)
    if mods_node is not None:
        return mods_node

    # Surface the repository's own error (e.g. idDoesNotExist) when it sent one.
    oai_error = root.find("oai:error", namespaces=ns)
    if oai_error is not None:
        code = oai_error.get("code", "")
        detail = (oai_error.text or "").strip()
        raise ValueError(f"OAI-PMH-Fehler ({code}) für {mods_url}: {detail}")

    raise ValueError(f"Kein MODS-Element in der OAI-PMH-Antwort gefunden: {mods_url}")
90
-
91
-
92
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Args:
        book_root: Root element of the book's MODS record.
        repo_base_url: Repository base URL; its last path segment is used as
            the repository code when the local identifier has no
            ``repo:`` prefix.

    Returns:
        A dict with the keys ``book_title``, ``series_title``,
        ``series_issn``, ``publisher_name``, ``pub_year`` (int),
        ``pub_month`` and ``pub_day`` (strings, always today's date),
        ``noisbn_reason``, ``book_doi``, ``book_resource``,
        ``report_number`` (always empty here), ``editors`` and ``authors``
        (lists of ``{"given": ..., "family": ...}`` dicts).
    """
    ns = book_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series (optional)
    series_title = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title",
        ns,
    )
    series_issn = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']",
        ns,
    )

    # Editors, and book-level authors (the latter are used for monographs).
    editors = []
    authors = []
    persons_by_role = {"editor": editors, "author": authors}
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is None:
            continue
        # An empty <roleTerm/> has text None; treat it as "no known role".
        bucket = persons_by_role.get((role.text or "").strip().lower())
        if bucket is None:
            continue
        bucket.append({
            "given": get_text(name, "mods:namePart[@type='given']", ns),
            "family": get_text(name, "mods:namePart[@type='family']", ns),
        })

    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Publication year: prefer the w3cdtf key date, else any dateIssued.
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns,
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)

    # Persistent URL, e.g. https://www.dora.lib4ri.ch/psi/item/psi:84778
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id:
        # Fall back to the DOI suffix when there is no local identifier.
        book_id = book_doi.split("/")[-1] if book_doi else ""

    # Repo code comes from the ID itself ('psi' in 'psi:84778') when possible.
    if ":" in book_id:
        current_repo = book_id.split(":")[0]
    else:
        current_repo = repo_base_url.rstrip("/").split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # Without an ISBN a noisbn reason is pre-filled.
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "" if isbn_val else "archive_volume"

    # Today's date is the default for month/day and for a missing year.
    today = datetime.date.today()
    try:
        year_value = int(pub_year[:4]) if pub_year else today.year
    except ValueError:
        # Non-numeric date text (e.g. "[2020]"): fall back like a missing date.
        year_value = today.year

    return {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        "pub_year": year_value,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason,
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": editors,
        "authors": authors,
    }
179
-
180
-
181
def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
    """Convert a chapter MODS record into a Crossref ``<content_item>``.

    Optional parts (contributors, abstract, publication date, pages, DOI
    data) are only emitted when the MODS record provides the data, so that
    no empty elements or empty attribute values end up in the deposit.

    Args:
        mods_root: Root element of the chapter's MODS record.
        repo_base_url: Repository base URL; its last path segment is the
            fallback repository code for the chapter's resource URL.

    Returns:
        A tuple ``(content_item, page_number)``. ``page_number`` is the
        integer first page, or ``999999`` when it is missing or not
        numeric; callers use it as a sort key.
    """
    ns = mods_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
    date_issued = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(mods_root, ".//mods:abstract", ns)
    first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)

    # Only a four-digit year is usable; anything else counts as "no year".
    year = date_issued[:4]
    if not (len(year) == 4 and year.isascii() and year.isdigit()):
        year = ""

    # Authors (role compared case-insensitively; empty <roleTerm/> is skipped)
    authors = []
    for name in mods_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is None or (role.text or "").strip().lower() != "author":
            continue
        given = get_text(name, "mods:namePart[@type='given']", ns)
        family = get_text(name, "mods:namePart[@type='family']", ns)
        authors.append((given, family))

    ci = etree.Element("content_item", component_type="chapter")

    # Contributors: skipped entirely when there are none.
    if authors:
        contribs = etree.SubElement(ci, "contributors")
        for idx, (given, family) in enumerate(authors):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role="author",
            )
            if given:
                etree.SubElement(pn, "given_name").text = given
            etree.SubElement(pn, "surname").text = family

    # Title
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS); the language is fixed to "en" as in the original code.
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date: only with a year, never as an empty container.
    if year:
        pub = etree.SubElement(ci, "publication_date", media_type="online")
        etree.SubElement(pub, "year").text = year

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # License information (AccessIndicators) - must come before doi_data.
    # The license is hard-coded to CC BY 4.0 for every chapter.
    ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
    license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
    license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
    license_ref.set("applies_to", "vor")
    if year:
        # An empty start_date is not a valid date, so it is omitted instead.
        license_ref.set("start_date", year + "-01-01")

    # DOI and persistent resource URL (object ID = DOI suffix)
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi

        chapter_id = doi.split("/")[-1] if "/" in doi else doi
        if ":" in chapter_id:
            repo_code_extracted = chapter_id.split(":")[0]
        else:
            repo_code_extracted = repo_base_url.rstrip("/").split("/")[-1]
        etree.SubElement(doi_data, "resource").text = build_persistent_url(
            repo_code_extracted, chapter_id
        )

    # Sort key: chapters without a numeric first page go to the end.
    try:
        page_number = int(first_page)
    except ValueError:
        page_number = 999999

    return ci, page_number
266
-
267
-
268
def _add_series_metadata(parent, book_meta: dict) -> None:
    """Append <series_metadata> with the series title and/or ISSN to *parent*."""
    series_metadata = etree.SubElement(parent, "series_metadata")
    if book_meta.get("series_title"):
        stitles = etree.SubElement(series_metadata, "titles")
        etree.SubElement(stitles, "title").text = book_meta["series_title"]
    if book_meta.get("series_issn"):
        etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]


def _add_date_part(parent, tag: str, raw) -> None:
    """Append a zero-padded <month>/<day> element if *raw* is a usable integer."""
    if not raw or not str(raw).strip():
        return
    try:
        value = int(raw)
    except ValueError:
        # Non-numeric user input: leave the optional element out.
        return
    etree.SubElement(parent, tag).text = f"{value:02d}"


def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """Build the Crossref ``<doi_batch>`` deposit document.

    Args:
        book_meta: Book-level metadata; reads ``book_title``, ``pub_year``,
            ``publisher_name`` and the optional ``series_title``,
            ``series_issn``, ``pub_month``, ``pub_day``, ``noisbn_reason``,
            ``report_number``, ``book_doi``, ``book_resource``, ``editors``
            and ``authors`` entries.
        depositor_meta: Must contain ``doi_batch_id``, ``depositor_name``,
            ``depositor_email`` and ``registrant``.
        chapter_items: ``(content_item, page_number)`` pairs; they are
            appended in ascending ``page_number`` order. The list passed in
            is not modified.
        book_type: ``'edited_book'``, ``'monograph'`` or ``'report-paper'``
            (custom internal flag that switches to the ``<report-paper>``
            structure).

    Returns:
        The pretty-printed UTF-8 XML document including the XML declaration.

    Raises:
        KeyError: if a required key is missing from *book_meta* or
            *depositor_meta*.
    """
    is_report = book_type == "report-paper"

    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        },
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd"
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]

    # UTC timestamp; datetime.utcnow() is deprecated, so use an aware "now".
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts

    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]

    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY: <report-paper> or <book>, each with a series or a plain metadata root
    body = etree.SubElement(doi_batch, "body")
    if is_report:
        container = etree.SubElement(body, "report-paper")
        series_tag, plain_tag = "report-paper_series_metadata", "report-paper_metadata"
    else:
        container = etree.SubElement(body, "book", book_type=book_type)
        series_tag, plain_tag = "book_series_metadata", "book_metadata"

    # 1. SERIES METADATA (only when a series title or ISSN is known)
    if book_meta.get("series_title") or book_meta.get("series_issn"):
        metadata_root = etree.SubElement(container, series_tag)
        _add_series_metadata(metadata_root, book_meta)
    else:
        metadata_root = etree.SubElement(container, plain_tag)

    # 2. CONTRIBUTORS: authors for monographs/reports, editors otherwise
    if book_type in ("monograph", "report-paper"):
        contributors_list = book_meta.get("authors", [])
        role = "author"
    else:
        contributors_list = book_meta.get("editors", [])
        role = "editor"

    if contributors_list:
        contribs = etree.SubElement(metadata_root, "contributors")
        for idx, person in enumerate(contributors_list):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role=role,
            )
            # An empty <given_name> is left out rather than emitted empty.
            if person["given"]:
                etree.SubElement(pn, "given_name").text = person["given"]
            etree.SubElement(pn, "surname").text = person["family"]

    # 3. TITLES
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # 4. PUBLICATION DATE (month and day are optional, year always written)
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    _add_date_part(pub, "month", book_meta.get("pub_month"))
    _add_date_part(pub, "day", book_meta.get("pub_day"))
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # 5. NOISBN (only for books)
    if not is_report and book_meta.get("noisbn_reason"):
        etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # 6. PUBLISHER
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # 7. PUBLISHER ITEM (report number) - only for report-paper
    if is_report and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(
            publisher_item, "identifier", id_type="report-number"
        ).text = book_meta["report_number"]

    # 8. DOI DATA
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # 9. COMPONENTS (chapters), ordered by page number. sorted() leaves the
    # caller's list untouched and is stable like list.sort().
    for ci, _page in sorted(chapter_items, key=lambda item: item[1]):
        container.append(ci)

    return etree.tostring(
        doi_batch,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )
427
-
428
-
429
class CrossrefSchemaResolver(etree.Resolver):
    """Resolver that downloads XSD files referenced by the Crossref schema.

    Absolute http(s) URLs are fetched as given, the four known MathML3
    schema file names are fetched from w3.org, and every other name is
    looked up below ``https://www.crossref.org/schemas/``.
    """

    def resolve(self, url, id, context):
        """Fetch *url* and hand its content to the parser.

        Returns ``None`` when the download fails for any reason, which
        leaves resolution to the parser's default behaviour.
        """
        # File names that live on w3.org rather than on crossref.org.
        w3c_locations = {
            'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
            'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
            'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
            'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
        }

        if url.startswith(("http://", "https://")):
            target = url
        else:
            target = w3c_locations.get(url, f"https://www.crossref.org/schemas/{url}")

        try:
            fetched = requests.get(target, timeout=15)
            fetched.raise_for_status()
            return self.resolve_string(fetched.content, context)
        except Exception:
            # Best effort: on any failure fall back to the default resolution.
            return None
456
-
457
-
458
def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """Validate Crossref XML against the Crossref 4.4.2 XSD schema.

    The compiled schema is downloaded on first use and then kept in
    ``st.session_state.crossref_schema`` so later calls can reuse it.

    Returns:
        tuple: ``(is_valid, error_messages)``. Problems while parsing the
        XML, loading the schema or validating are reported through the
        message list rather than raised.
    """
    messages = []

    try:
        document = etree.fromstring(xml_bytes)

        if 'crossref_schema' not in st.session_state:
            try:
                # Included/imported XSDs are fetched through the custom resolver.
                xsd_parser = etree.XMLParser()
                xsd_parser.resolvers.add(CrossrefSchemaResolver())

                xsd_response = requests.get(
                    "https://www.crossref.org/schemas/crossref4.4.2.xsd",
                    timeout=30,
                )
                xsd_response.raise_for_status()

                xsd_doc = etree.fromstring(xsd_response.content, xsd_parser)
                st.session_state.crossref_schema = etree.XMLSchema(xsd_doc)
            except Exception as e:
                messages.append(f"Fehler beim Laden des XSD Schemas: {e}")
                return False, messages

        schema = st.session_state.crossref_schema

        is_valid = schema.validate(document)
        if is_valid:
            return is_valid, messages

        # One message per schema violation, prefixed with its line number.
        for entry in schema.error_log:
            messages.append(f"Zeile {entry.line}: {entry.message}")
        return is_valid, messages

    except etree.XMLSyntaxError as e:
        messages.append(f"XML Syntax Fehler: {e}")
        return False, messages
    except Exception as e:
        messages.append(f"Unerwarteter Fehler: {e}")
        return False, messages
509
-
510
-
511
- # =====================================================================
512
- # REPOSITORY CONFIGURATION
513
- # =====================================================================
514
-
515
# Per-repository defaults, keyed by repository code. Each entry holds the
# publisher and registrant name (identical for every repository), the DOI
# prefix and a "role" value.
REPO_CONFIG = {
    code: {
        "publisher": institute,
        "registrant": institute,
        "prefix": doi_prefix,
        "role": role,
    }
    for code, institute, doi_prefix, role in (
        (
            "wsl",
            "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
            "10.55419",
            "wslx",
        ),
        (
            "psi",
            "Paul Scherrer Institute, PSI",
            "10.55402",
            "psit",
        ),
        (
            "empa",
            "Swiss Federal Laboratories for Materials Science and Technology, Empa",
            "10.55368",
            "empa",
        ),
        (
            "eawag",
            "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
            "10.55408",
            "eawa",
        ),
    )
}
541
-
542
- def main():
543
- st.title("Crossref XML Generator/Uploader")
544
-
545
- st.markdown(
546
- "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
547
- "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
548
- )
549
-
550
- st.subheader("Konfiguration & Quelle")
551
-
552
- col_config, col_source = st.columns(2)
553
-
554
- with col_config:
555
- st.markdown("#### Verbindung & Typ")
556
- base_url = st.text_input(
557
- "DORA Basis-URL",
558
- value="https://admin.dora.lib4ri.ch"
559
- )
560
-
561
- repo_list = list(REPO_CONFIG.keys())
562
- repo_code = st.selectbox(
563
- "Repository-Code",
564
- options=repo_list,
565
- index=0,
566
- format_func=lambda x: x.upper()
567
- )
568
-
569
- repo_config = REPO_CONFIG[repo_code]
570
- repo_base_url = f"{base_url.rstrip('/')}/{repo_code}"
571
-
572
- pub_type = st.radio(
573
- "Publikationstyp",
574
- ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
575
- horizontal=False
576
- )
577
-
578
- # Mapping auf Crossref book_type / report type
579
- cr_book_type = "edited_book"
580
- if "Monograph" in pub_type:
581
- cr_book_type = "monograph"
582
- elif "Paper Series" in pub_type:
583
- cr_book_type = "report-paper"
584
-
585
- with col_source:
586
- st.markdown("#### MODS-Quelle")
587
- # Dynamic default ID based on repo
588
- default_id = "41891"
589
- if repo_code == "psi":
590
- default_id = "84057"
591
-
592
- book_id_or_url = st.text_input(
593
- "DORA-ID oder MODS-URL",
594
- value=f"{repo_code}:{default_id}",
595
- help="Beispiel: wsl:41900 oder komplette URL"
596
- )
597
-
598
- st.write("") # Spacer
599
- if st.button("Metadaten laden", type="primary"):
600
- try:
601
- mods_url = build_dora_mods_url(base_url, repo_code, book_id_or_url)
602
- st.info(f"Lade MODS von: {mods_url}")
603
- book_root = fetch_mods_xml(mods_url)
604
- meta = parse_book_mods(book_root, repo_base_url)
605
-
606
- # --- Attempt to extract report number from MODS ---
607
- ns = book_root.nsmap.copy()
608
- if "mods" not in ns:
609
- ns["mods"] = MODS_NS
610
- report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
611
- if not report_num:
612
- report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
613
-
614
- if not report_num:
615
- # Check <note type="report number">
616
- report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
617
-
618
- if report_num:
619
- meta["report_number"] = report_num
620
- st.info(f"Report Number gefunden: {report_num}")
621
- # --------------------------------------------------
622
-
623
- # Update flat fields in session state for widgets
624
- for k, v in meta.items():
625
- if k in ["book_title", "series_title", "series_issn", "publisher_name",
626
- "pub_year", "pub_month", "pub_day", "noisbn_reason",
627
- "book_doi", "book_resource", "report_number"]:
628
- st.session_state[k] = v
629
- st.session_state.book_meta[k] = v
630
-
631
- # Special handling for persons text area
632
- if cr_book_type in ["monograph", "report-paper"]:
633
- current_list = meta.get("authors", [])
634
- else:
635
- current_list = meta.get("editors", [])
636
- st.session_state["persons_input"] = "\n".join(f"{e['given']};{e['family']}" for e in current_list)
637
-
638
- st.session_state.book_meta_loaded = True
639
- st.success("Metadaten erfolgreich geladen.")
640
- st.rerun()
641
- except Exception as e:
642
- st.error(f"Fehler beim Laden der MODS: {e}")
643
- import traceback
644
- st.text(traceback.format_exc())
645
-
646
- # Session State Init Logic (unchanged but placed after UI definition for clarity in reading flow, strictly it runs before inputs generally)
647
- if "book_meta_loaded" not in st.session_state:
648
- st.session_state.book_meta_loaded = False
649
-
650
- # Current date for defaults
651
- today = datetime.date.today()
652
-
653
- # Initialize session state keys for widgets if not present
654
- if "book_title" not in st.session_state:
655
- st.session_state.book_title = ""
656
- if "series_title" not in st.session_state:
657
- st.session_state.series_title = ""
658
- if "series_issn" not in st.session_state:
659
- st.session_state.series_issn = ""
660
- if "publisher_name" not in st.session_state:
661
- st.session_state.publisher_name = repo_config["publisher"]
662
- if "pub_year" not in st.session_state:
663
- st.session_state.pub_year = today.year
664
- if "pub_month" not in st.session_state:
665
- st.session_state.pub_month = str(today.month)
666
- if "pub_day" not in st.session_state:
667
- st.session_state.pub_day = str(today.day)
668
- if "noisbn_reason" not in st.session_state:
669
- st.session_state.noisbn_reason = ""
670
- if "book_doi" not in st.session_state:
671
- st.session_state.book_doi = ""
672
- if "book_resource" not in st.session_state:
673
- st.session_state.book_resource = ""
674
- if "report_number" not in st.session_state:
675
- st.session_state.report_number = ""
676
- if "persons_input" not in st.session_state:
677
- st.session_state.persons_input = ""
678
-
679
- if "book_meta" not in st.session_state:
680
- st.session_state.book_meta = {
681
- "book_title": "",
682
- "series_title": "",
683
- "series_issn": "",
684
- "publisher_name": repo_config["publisher"],
685
- "pub_year": today.year,
686
- "pub_month": str(today.month),
687
- "pub_day": str(today.day),
688
- "noisbn_reason": "",
689
- "book_doi": "",
690
- "book_resource": "",
691
- "report_number": "",
692
- "editors": [],
693
- "authors": [],
694
- }
695
-
696
- # CHECK: has the repo code changed since last run?
697
- if "last_repo_code" not in st.session_state:
698
- st.session_state.last_repo_code = repo_code
699
- st.session_state.registrant = repo_config["registrant"]
700
- st.session_state.cr_role = repo_config.get("role", "")
701
-
702
- if st.session_state.last_repo_code != repo_code:
703
- # Repo changed! Update defaults
704
- st.session_state.publisher_name = repo_config["publisher"]
705
- st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
706
- st.session_state.registrant = repo_config["registrant"]
707
-
708
- # If the user hasn't typed anything yet or if we force update?
709
- # Let's force update the role in session state so the input widget picks it up
710
- st.session_state.cr_role = repo_config.get("role", "")
711
-
712
- st.session_state.last_repo_code = repo_code
713
-
714
- st.markdown("---")
715
- st.subheader("Metadaten & Inhalte")
716
-
717
- # Use expander for metadata editing to keep UI clean
718
- with st.expander("Metadaten bearbeiten", expanded=True):
719
- bm = st.session_state.book_meta
720
-
721
- col_b1, col_b2 = st.columns(2)
722
- with col_b1:
723
- st.text_input("Titel", key="book_title")
724
- st.text_input("Serientitel", key="series_title")
725
- st.text_input("Serien-ISSN", key="series_issn")
726
- st.text_input("Publisher Name", key="publisher_name")
727
-
728
- if cr_book_type == "report-paper":
729
- st.text_input("Report Number", key="report_number")
730
-
731
- with col_b2:
732
- c_y, c_m, c_d = st.columns(3)
733
- with c_y:
734
- st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
735
- with c_m:
736
- st.text_input("Monat", key="pub_month")
737
- with c_d:
738
- st.text_input("Tag", key="pub_day")
739
-
740
- if cr_book_type != "report-paper":
741
- st.text_input("noisbn reason", key="noisbn_reason")
742
-
743
- st.markdown("##### Identifikatoren")
744
- col_id1, col_id2 = st.columns(2)
745
- with col_id1:
746
- st.text_input("DOI", key="book_doi")
747
- with col_id2:
748
- st.text_input("Resource URL", key="book_resource")
749
-
750
- st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")
751
-
752
- st.markdown("##### Mitwirkende")
753
- # Decide label based on type
754
- if cr_book_type in ["monograph", "report-paper"]:
755
- st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
756
- label = "Autoren"
757
- else:
758
- st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
759
- label = "Editoren"
760
-
761
- persons_text = st.text_area(label, key="persons_input", height=100)
762
-
763
- # Parse and save back
764
- new_persons = []
765
- for line in persons_text.splitlines():
766
- line = line.strip()
767
- if not line:
768
- continue
769
- parts = [p.strip() for p in line.split(";")]
770
- if len(parts) == 2:
771
- new_persons.append({"given": parts[0], "family": parts[1]})
772
-
773
- if cr_book_type in ["monograph", "report-paper"]:
774
- bm["authors"] = new_persons
775
- else:
776
- bm["editors"] = new_persons
777
-
778
- st.markdown("---")
779
- st.subheader("Depositor & Batch Info")
780
-
781
- with st.expander("Depositor Details", expanded=False):
782
- col_d1, col_d2 = st.columns(2)
783
- with col_d1:
784
- depositor_name = st.text_input(
785
- "Depositor Name",
786
- value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL"
787
- )
788
- with col_d2:
789
- depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")
790
-
791
- ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
792
-
793
- batch_prefix = "book"
794
- if cr_book_type == "report-paper":
795
- batch_prefix = "report"
796
- elif cr_book_type == "monograph":
797
- batch_prefix = "monograph"
798
-
799
- doi_batch_id = st.text_input(
800
- "DOI Batch ID",
801
- value=f"{batch_prefix}_{ts}",
802
- help="Wird im XML-Header verwendet. Sollte eindeutig sein."
803
- )
804
-
805
- if "registrant" not in st.session_state:
806
- st.session_state.registrant = repo_config["registrant"]
807
-
808
- registrant = st.text_input("Registrant", value=st.session_state.registrant)
809
- st.session_state.registrant = registrant
810
-
811
- depositor_meta = {
812
- "depositor_name": depositor_name,
813
- "depositor_email": depositor_email,
814
- "registrant": st.session_state.registrant,
815
- "doi_batch_id": doi_batch_id
816
- }
817
-
818
- st.subheader("Kapitel / Inhalte")
819
- st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")
820
-
821
- st.markdown(
822
- "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
823
- "pro Zeile ein."
824
- )
825
-
826
- chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")
827
-
828
- st.markdown("---")
829
- st.subheader("XML Generierung")
830
-
831
- if st.button("Crossref XML generieren", type="primary"):
832
- try:
833
- chapter_items = []
834
-
835
- for line in chapters_text.splitlines():
836
- line = line.strip()
837
- if not line:
838
- continue
839
- mods_url = build_dora_mods_url(base_url, repo_code, line)
840
- st.write(f"Lade Kapitel-MODS von: {mods_url}")
841
- mods_root = fetch_mods_xml(mods_url)
842
- ci, page_no = mods_to_content_item(mods_root, repo_base_url)
843
- chapter_items.append((ci, page_no))
844
-
845
- if not chapter_items and cr_book_type == "edited_book":
846
- st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")
847
-
848
- # book_meta aus session state / widgets zusammenbauen
849
- book_meta = {
850
- "book_title": st.session_state.book_title,
851
- "series_title": st.session_state.series_title,
852
- "series_issn": st.session_state.series_issn,
853
- "publisher_name": st.session_state.publisher_name,
854
- "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
855
- "pub_month": st.session_state.pub_month,
856
- "pub_day": st.session_state.pub_day,
857
- "noisbn_reason": st.session_state.get("noisbn_reason", ""),
858
- "book_doi": st.session_state.book_doi,
859
- "book_resource": st.session_state.book_resource,
860
- "report_number": st.session_state.get("report_number", ""),
861
- "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
862
- "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
863
- }
864
-
865
- xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)
866
-
867
- # Store in session state
868
- st.session_state.crossref_xml = xml_bytes
869
- st.session_state.crossref_filename = "crossref_edited_book.xml"
870
-
871
- st.success("Crossref XML erfolgreich erzeugt!")
872
-
873
- # Validierung gegen Crossref XSD Schema
874
- st.subheader("XML Validierung")
875
- with st.spinner("Validiere XML gegen Crossref Schema..."):
876
- is_valid, validation_errors = validate_crossref_xml(xml_bytes)
877
-
878
- if is_valid:
879
- st.success("✓ XML ist valide und bereit für Crossref!")
880
- else:
881
- st.error("✗ XML Validierung fehlgeschlagen:")
882
- for error in validation_errors:
883
- st.error(f" • {error}")
884
- st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")
885
-
886
- except Exception as e:
887
- st.error(f"Fehler bei der Erzeugung des XML: {e}")
888
- import traceback
889
- st.text(traceback.format_exc())
890
-
891
- # Display Download and Upload if XML exists in session state
892
- if "crossref_xml" in st.session_state:
893
- xml_bytes = st.session_state.crossref_xml
894
-
895
- # Download Button
896
- st.download_button(
897
- label="XML herunterladen",
898
- data=xml_bytes,
899
- file_name=st.session_state.crossref_filename,
900
- mime="application/xml"
901
- )
902
-
903
- # ---------------------------------------------------------
904
- # Crossref Upload Section
905
- # ---------------------------------------------------------
906
- st.markdown("---")
907
- st.subheader("Automatischer Upload zu Crossref")
908
-
909
- # Determine default role if not in session state
910
- if "cr_role" not in st.session_state:
911
- st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")
912
-
913
- col_u1, col_u2 = st.columns(2)
914
- with col_u1:
915
- cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
916
- # Use key to bind to session state
917
- cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
918
- with col_u2:
919
- cr_pass = st.text_input("Crossref Password", type="password")
920
-
921
- if st.button("Upload to Crossref"):
922
- if not cr_user or not cr_pass:
923
- st.error("Bitte Username und Passwort für Crossref angeben.")
924
- else:
925
- with st.spinner("Lade zu Crossref hoch..."):
926
- res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)
927
-
928
- if isinstance(res, str) and res.startswith("Exception"):
929
- st.error(f"Upload fehlgeschlagen: {res}")
930
- else:
931
- # Crossref returns 200 even on some logic errors, text contains details
932
- if res.status_code == 200:
933
- if "successfully received" in res.text:
934
- st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
935
- with st.expander("Server-Antwort ansehen"):
936
- st.text(res.text)
937
- else:
938
- st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
939
- with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
940
- st.text(res.text)
941
- else:
942
- st.error(f"HTTP Fehler: {res.status_code}")
943
- st.text(res.text)
944
-
945
-
946
def upload_to_crossref(xml_content, username, password, role=None):
    """Submit a Crossref deposit XML via an HTTP multipart POST.

    Args:
        xml_content: Deposit XML payload, sent as the ``fname`` file part
            under the fixed filename ``crossref_submission.xml``.
        username: Crossref account name.
        password: Crossref account password.
        role: Optional role. When it is non-empty after stripping
            whitespace, the login id becomes ``"<username>/<role>"``;
            otherwise the bare username is used.

    Returns:
        The response object returned by ``requests.post`` when the request
        completes, or a string starting with ``"Exception: "`` when posting
        raised.
    """
    trimmed_role = role.strip() if role else ""
    credentials = f"{username}/{trimmed_role}" if trimmed_role else username

    # Plain form fields of the multipart request.
    form_fields = {
        "operation": "doMDUpload",
        "login_id": credentials,
        "login_passwd": password,
    }
    # File part: (filename, file content, content type).
    upload_part = {
        "fname": ("crossref_submission.xml", xml_content, "application/xml"),
    }

    try:
        return requests.post(
            "https://doi.crossref.org/servlet/deposit",
            files=upload_part,
            data=form_fields,
            timeout=60,
        )
    except Exception as exc:
        # Failures are reported as a string instead of being raised.
        return f"Exception: {exc}"
974
-
975
# Script entry point: call main() only when this file is executed directly.
- if __name__ == "__main__":
976
- main()