andrehoffmann80 committed on
Commit
9336543
·
verified ·
1 Parent(s): 345393f

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +346 -0
  2. converter.py +484 -0
  3. requirements.txt +3 -3
  4. utils.py +161 -0
app.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import io
import os
import traceback
from urllib.parse import quote, unquote

import requests
import streamlit as st
from lxml import etree

from converter import ModsConverter
8
+
9
st.set_page_config(
    page_title="PubTypeConverter | DORA Tools",
    page_icon="🔄",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Stylesheet plus the title banner. Both are injected as raw HTML through a
# single st.markdown call, so the CSS classes defined here (.result-card,
# .step-header, .step-number, ...) are available to the markup emitted later.
_PAGE_HEADER_HTML = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

html, body, [class*="css"] {
font-family: 'Inter', sans-serif;
}

.stApp {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
}

/* Premium Header */
.title-container {
padding: 2rem 0;
text-align: center;
background: rgba(255, 255, 255, 0.4);
backdrop-filter: blur(10px);
border-radius: 20px;
margin-bottom: 2rem;
border: 1px solid rgba(255, 255, 255, 0.5);
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.07);
}

.main-title {
font-size: 3rem;
font-weight: 700;
background: linear-gradient(90deg, #1e3a8a, #3b82f6);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}

.sub-title {
color: #64748b;
font-size: 1.1rem;
}

/* Section Styling */
.stSelectbox, .stTextInput, .stButton {
margin-bottom: 1rem;
}

/* Card-like containers for results */
.result-card {
background: white;
padding: 1.5rem;
border-radius: 15px;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
margin-bottom: 1.5rem;
border-left: 5px solid #3b82f6;
}

/* Step indicators */
.step-header {
font-weight: 600;
color: #1e293b;
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 0.5rem;
}

.step-number {
background: #3b82f6;
color: white;
width: 24px;
height: 24px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 0.8rem;
}
</style>

<div class="title-container">
<div class="main-title">DORA PubTypeConverter</div>
<div class="sub-title">DORA Publication Type Transformation Helper</div>
</div>
"""

st.markdown(_PAGE_HEADER_HTML, unsafe_allow_html=True)
98
+
99
# Resource locations, all resolved relative to the directory of this script
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RESOURCE_DIR = os.path.join(BASE_DIR, "PubTypeConverter_resources")
TEMPLATE_DIR = os.path.join(RESOURCE_DIR, "PubTypeConverter_templates")
CONFIG_FILE = os.path.join(RESOURCE_DIR, "PubTypeConverterConfig.xml")

# One converter per browser session; the move rules are loaded only once
if 'converter' not in st.session_state:
    converter = ModsConverter()
    if not os.path.exists(CONFIG_FILE):
        st.warning(f"Configuration file not found at {CONFIG_FILE}. Content moving rules will not be applied.")
    else:
        converter.load_config(CONFIG_FILE)
    st.session_state.converter = converter

# Records fetched from DORA, kept across reruns.
# Each entry is a dict: {'name': str, 'content': bytes}
if 'loaded_files' not in st.session_state:
    st.session_state.loaded_files = []

# Every *.xml file in the template directory is an available target type
if os.path.exists(TEMPLATE_DIR):
    templates = sorted(
        entry for entry in os.listdir(TEMPLATE_DIR) if entry.lower().endswith(".xml")
    )
else:
    st.error(f"Template directory not found: {TEMPLATE_DIR}")
    templates = []
126
+
127
+
128
+ # Helper function to fetch from DORA
129
def fetch_from_dora(pid_or_url, repo):
    """Download the MODS record for a DORA PID or URL.

    For a PID, three sources are tried in order: the admin host, the public
    www host, and finally the OAI-PMH endpoint (from whose response the first
    ``mods`` element is extracted). A full http(s) URL is requested as given,
    without fallbacks.

    Args:
        pid_or_url: A PID such as ``"psi:84411"``, a bare id such as
            ``"84411"`` (prefixed with *repo*), or a URL. Any number of
            percent-encoding layers is removed first.
        repo: Repository name used when the PID carries no prefix. A prefix
            found in the PID takes precedence.

    Returns:
        ``(content, source_url)`` on success, where *content* is bytes, or
        ``(None, error_message)`` when every source failed.
    """
    url = pid_or_url.strip()

    # Handle double-encoded PIDs (e.g. psi%253A84411 -> psi%3A84411 -> psi:84411)
    # by unquoting until the string stabilizes.
    prev = None
    while url != prev:
        prev = url
        url = unquote(url)
    url = url.strip()  # decoding can expose encoded whitespace such as %20

    if not url:
        # Without this guard a request for the PID "<repo>:" would be sent.
        return None, "Failed to fetch. Details: no identifier given"

    fallback_www = None
    fallback_oai = None

    # NOTE(review): a URL typed by the user is requested as-is from the
    # server side. If this app is reachable by untrusted users, consider
    # restricting the allowed hosts.
    if not url.startswith("http"):
        if ":" in url:
            # "psi:12345" -> the prefix (lower-cased) selects the repository
            parts = url.split(":")
            repo = parts[0].lower()
            pid_val = f"{repo}:{parts[1]}"
        else:
            # Bare id: combine with the repository chosen in the UI
            pid_val = f"{repo}:{url}"

        # The PID must be URL-encoded inside the path
        quoted_pid = quote(pid_val)

        # Primary: Admin (Intranet, preferred)
        url = f"https://admin.dora.lib4ri.ch/{repo}/islandora/object/{quoted_pid}/datastream/MODS/view"
        # Fallback 1: WWW (Public mirror)
        fallback_www = f"https://www.dora.lib4ri.ch/{repo}/islandora/object/{quoted_pid}/datastream/MODS/view"
        # Fallback 2: OAI-PMH
        fallback_oai = f"https://www.dora.lib4ri.ch/{repo}/oai2/request?verb=GetRecord&metadataPrefix=mods&identifier={quoted_pid}"

    headers = {'User-Agent': 'curl/7.68.0'}

    def extract_mods(oai_payload):
        """Return the first ``mods`` element of an OAI response as bytes, else None."""
        try:
            root = etree.fromstring(oai_payload)
        except etree.XMLSyntaxError:
            return None
        # An OAI <error> element means the record could not be delivered
        if root.xpath(".//*[local-name()='error']"):
            return None
        # Match on local name so that any namespace/prefix is accepted
        mods_nodes = root.xpath(".//*[local-name()='mods']")
        if mods_nodes:
            return etree.tostring(mods_nodes[0], encoding='utf-8')
        return None

    status_errors = []

    # Direct datastream views: the first one that answers successfully wins
    for label, candidate in (("Admin", url), ("WWW", fallback_www)):
        if not candidate:
            continue
        try:
            response = requests.get(candidate, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            status_errors.append(f"{label}: {exc}")
        else:
            return response.content, candidate

    # Last resort: OAI-PMH, which wraps the record in an envelope
    if fallback_oai:
        try:
            response = requests.get(fallback_oai, headers=headers, timeout=10)
        except requests.RequestException as exc:
            status_errors.append(f"OAI: {exc}")
        else:
            if response.status_code != 200:
                status_errors.append(f"OAI: HTTP {response.status_code}")
            else:
                mods_content = extract_mods(response.content)
                if mods_content:
                    return mods_content, fallback_oai
                status_errors.append("OAI: Valid HTTP but no MODS found in response")

    return None, f"Failed to fetch. Details: {'; '.join(status_errors)}"
219
+
220
# UI Layout: left column = source selection, right column = target type and
# the conversion trigger; conversion reports are rendered below both columns.
main_col1, main_col2 = st.columns([0.6, 0.4], gap="large")

with main_col1:
    st.markdown('<div class="step-header"><div class="step-number">1</div><span>Select Source Data</span></div>', unsafe_allow_html=True)

    input_tab1, input_tab2 = st.tabs(["🌐 Pull from DORA", "📁 Upload Local XML"])

    with input_tab1:
        dora_col1, dora_col2 = st.columns([0.7, 0.3])
        dora_input = dora_col1.text_input("PID or URL", placeholder="e.g. psi:84411", label_visibility="collapsed")
        repo_select = dora_col2.selectbox("Repo", ["psi", "eawag", "empa", "wsl"], label_visibility="collapsed")

        if st.button("Fetch and Load Record", use_container_width=True):
            identifier = dora_input.strip()
            if identifier:
                with st.spinner("Retrieving from DORA..."):
                    content, error_or_url = fetch_from_dora(identifier, repo_select)
                    if content:
                        filename = identifier.replace(":", "_").replace("/", "_") + ".xml"
                        # A pasted URL makes a poor file name; use a generic one
                        if filename.startswith("http"):
                            filename = "dora_record.xml"
                        st.session_state.loaded_files.append({"name": filename, "content": content, "source": error_or_url})
                        st.toast(f"Loaded {filename}", icon="✅")
                    else:
                        st.error(f"Fetch failed: {error_or_url}")
            else:
                st.warning("Please provide an identifier first.")

    with input_tab2:
        uploaded_files = st.file_uploader("Upload MODS XML files", type=['xml'], accept_multiple_files=True, label_visibility="collapsed")

    # List of records fetched from DORA, each with a remove button
    if st.session_state.loaded_files:
        st.markdown("### Loaded Documents")
        for i, file_data in enumerate(st.session_state.loaded_files):
            with st.container():
                f_col1, f_col2 = st.columns([0.85, 0.15])
                f_col1.markdown(f"📄 **{file_data['name']}**")
                if f_col2.button("🗑️", key=f"remove_{i}", help="Remove this file"):
                    st.session_state.loaded_files.pop(i)
                    # Rerun immediately so the list is not iterated further
                    st.rerun()

with main_col2:
    st.markdown('<div class="step-header"><div class="step-number">2</div><span>Target Format</span></div>', unsafe_allow_html=True)
    selected_template = st.selectbox("Choose the destination publication type", templates if templates else ["No templates found"], label_visibility="collapsed")

    st.markdown("---")
    has_sources = bool(uploaded_files or st.session_state.loaded_files)
    # The flag is True only during the rerun triggered by the button click.
    # NOTE(review): any later widget interaction (e.g. a download click)
    # reruns the script with the flag False, which hides the reports.
    st.session_state.start_convert = st.button(
        "🚀 Start Conversion",
        disabled=not has_sources or not templates,
        use_container_width=True,
        type="primary",
    )

# Combine sources: uploads first, then records pulled from DORA
all_files = []
if uploaded_files:
    for f in uploaded_files:
        all_files.append({"name": f.name, "content": f.getvalue()})
if st.session_state.loaded_files:
    all_files.extend(st.session_state.loaded_files)

if st.session_state.get("start_convert"):
    st.markdown('<div class="step-header"><div class="step-number">3</div><span>Conversion Reports</span></div>', unsafe_allow_html=True)

    template_path = os.path.join(TEMPLATE_DIR, selected_template)

    for file_index, file_data in enumerate(all_files):
        content = file_data['content']
        filename = file_data['name']

        try:
            # result_xml is the XML string (empty on failure), log_data the
            # structured report dict
            result_xml, log_data = st.session_state.converter.convert(content, template_path)

            # File names and genre values come from user-supplied data and
            # are embedded in raw HTML, so they must be escaped.
            safe_name = html.escape(filename)
            safe_old_genre = html.escape(log_data['old_genre'] or "")
            safe_new_genre = html.escape(log_data['new_genre'] or "")
            if result_xml:
                badge_text, badge_style = "TRANSFORMED", "background: #dbeafe; color: #1e40af;"
            else:
                badge_text, badge_style = "FAILED", "background: #fee2e2; color: #991b1b;"

            with st.container():
                # The card is closed inside the same markdown call: HTML
                # opened in one st.markdown call cannot wrap later elements.
                st.markdown(
                    '<div class="result-card">'
                    '<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">'
                    f'<span style="font-weight: 700; font-size: 1.2rem; color: #1e3a8a;">{safe_name}</span>'
                    f'<span style="{badge_style} padding: 0.2rem 0.8rem; border-radius: 999px; font-size: 0.8rem; font-weight: 600;">{badge_text}</span>'
                    '</div>'
                    '<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 1.5rem;">'
                    f'<span style="background: #f1f5f9; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #e2e8f0;">{safe_old_genre}</span>'
                    '<span style="color: #94a3b8;">➡️</span>'
                    f'<span style="background: #ecfdf5; color: #065f46; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #d1fae5; font-weight: 600;">{safe_new_genre}</span>'
                    '</div>'
                    '</div>',
                    unsafe_allow_html=True,
                )

                s_col1, s_col2, s_col3 = st.columns(3)
                s_col1.metric("Transfers", len(log_data["moves"]))
                s_col2.metric("Additions", len(log_data["additions"]))
                s_col3.metric("Removals", len(log_data["deletions"]))

                with st.expander("Audit Transformation Details", expanded=False):
                    if log_data["moves"]:
                        st.write("**🔄 Content Transfers**")
                        for m in log_data["moves"]:
                            st.caption(f"• {m['summary']}")
                    if log_data["additions"]:
                        st.write("**✨ Smart Additions**")
                        for a in log_data["additions"]:
                            st.caption(f"• {a['summary']}")
                    if log_data["deletions"]:
                        st.write("**🗑️ Legacy Cleanup**")
                        del_labels = [d['label'] for d in log_data["deletions"]]
                        # Show at most eight labels to keep the caption short
                        st.caption(f"Removed {len(del_labels)} unused fields: " + ", ".join(del_labels[:8]) + ("..." if len(del_labels) > 8 else ""))

                for w in log_data["warnings"]:
                    st.warning(w)

                if result_xml:
                    st.download_button(
                        label=f"⬇️ Download {filename}",
                        data=result_xml,
                        file_name=f"{os.path.splitext(filename)[0]}_converted.xml",
                        mime="application/xml",
                        # The index keeps keys unique when two sources share
                        # a name (every URL fetch is "dora_record.xml").
                        key=f"dl_{file_index}_{filename}",
                        use_container_width=True,
                    )
        except Exception as e:
            # Boundary of the per-file conversion: report and continue with
            # the next file instead of aborting the whole page.
            st.error(f"Error converting {filename}: {e}")
            st.code(traceback.format_exc())
346
+
converter.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ from lxml import etree
4
+ from typing import Dict, List, Tuple, Set
5
+ from utils import XmlHelper, DateInfo, NodeInfo, TAG_GENRE, TAG_ORIGIN_INFO, TAG_DATE_ISSUED, TAG_DATE_OTHER, ATTR_REPORTING_YEAR
6
+
7
class ModsConverter:
    """Rewrites a MODS record so that it fits another publication-type template.

    A conversion (see :meth:`convert`) replaces the genre, moves content
    between elements according to the rules read by :meth:`load_config`,
    deletes elements the template does not contain, copies default values
    from the template and aligns the placement of the reporting-year date
    with the template.
    """

    def __init__(self):
        # Key built by _get_key(old_genre, new_genre) -> list of
        # (source_elements, destination_elements) move rules.
        self.move_config = {}

    def load_config(self, config_path: str):
        """Load the content-moving rules from *config_path*.

        Every ``pubTypeConversion`` entry is registered in both directions
        (type1 -> type2 and, with source/destination swapped, type2 -> type1).
        A missing file is ignored; an unreadable or malformed file is
        reported on stdout and leaves the rules unchanged.
        """
        if not os.path.exists(config_path):
            return

        parser = etree.XMLParser(remove_blank_text=True)
        try:
            root = etree.parse(config_path, parser).getroot()
        except (etree.LxmlError, OSError) as e:
            print(f"Error loading config: {e}")
            return

        for conversion in root.findall("pubTypeConversion"):
            p1 = conversion.find("pubType1")
            p2 = conversion.find("pubType2")
            if p1 is None or p2 is None:
                continue

            forward_key = self._get_key(p1.text, p2.text)
            if not forward_key:
                # An empty pubType would register the rules under "" and
                # apply them to every record that lacks a genre text.
                continue

            moves = []
            for mc in conversion.findall("moveContent"):
                e1 = mc.find("element1")
                e2 = mc.find("element2")
                if e1 is not None and e2 is not None:
                    # The children of element1/element2 describe the path
                    moves.append((list(e1), list(e2)))

            if moves:
                self.move_config[forward_key] = moves
                # Reverse direction: swap source and destination
                self.move_config[self._get_key(p2.text, p1.text)] = [
                    (dest, source) for source, dest in moves
                ]

    def _get_key(self, p1, p2):
        """Return the lookup key for a conversion pair, or "" if either part is empty."""
        if not p1 or not p2:
            return ""
        return f"{p1.lower()}{p2.lower()}"

    def convert(self, input_xml_str: str, template_xml_path: str) -> Tuple[str, dict]:
        """Convert the input record to the publication type of the template.

        Args:
            input_xml_str: The record, in any form ``XmlHelper.parse_xml``
                accepts.
            template_xml_path: Path of the template of the target type.

        Returns:
            ``(result_xml, log_data)``. *result_xml* is the pretty-printed
            record, or ``""`` when parsing failed or a genre element is
            missing. *log_data* has the keys ``old_genre``, ``new_genre``,
            ``moves``, ``additions``, ``deletions`` and ``warnings``.
        """
        log_data = {
            "old_genre": "",
            "new_genre": "",
            "moves": [],
            "additions": [],
            "deletions": [],
            "warnings": []
        }

        try:
            input_root = XmlHelper.parse_xml(input_xml_str).getroot()
            template_root = XmlHelper.parse_xml(template_xml_path).getroot()
        except ValueError as e:
            log_data["warnings"].append(f"Error parsing XML: {e}")
            return "", log_data

        # 1. Exchange genre (text and attributes come from the template)
        input_genre = XmlHelper.get_genre_node(input_root)
        template_genre = XmlHelper.get_genre_node(template_root)
        if input_genre is None or template_genre is None:
            log_data["warnings"].append("Missing genre element in input or template.")
            return "", log_data

        old_genre = input_genre.text
        new_genre = template_genre.text
        log_data["old_genre"] = old_genre
        log_data["new_genre"] = new_genre

        input_genre.text = new_genre
        input_genre.attrib.clear()
        input_genre.attrib.update(template_genre.attrib)

        # 2. Move content according to the configured rules for this pair
        rules = self.move_config.get(self._get_key(old_genre, new_genre), [])
        for source_def, dest_def in rules:
            self._apply_move(input_root, source_def, dest_def, log_data)

        # 3. Delete everything the template does not know
        template_nodes_info = XmlHelper.get_all_nodes_info(template_root)
        self._delete_untemplated_nodes(input_root, template_nodes_info, log_data)

        # 4. Copy default values from the template
        self._sync_template_defaults(input_root, template_root, template_nodes_info, log_data)

        # 5. Put the reporting year where the template has it
        try:
            self._align_date_blocks(input_root, template_root)
        except ValueError as e:
            log_data["warnings"].append(f"Date processing warning: {e}")

        return etree.tostring(input_root, encoding='unicode', pretty_print=True), log_data

    @staticmethod
    def _qualified_tag(context, local_name):
        """Return *local_name* placed in the namespace of element *context* (if it has one)."""
        try:
            namespace = etree.QName(context).namespace
        except (ValueError, TypeError):
            namespace = None
        return f"{{{namespace}}}{local_name}" if namespace else local_name

    @staticmethod
    def _is_kept(node_info, template_paths_lower):
        """Decide whether an input node survives the template comparison."""
        path = node_info.name
        # Paths are compared case-insensitively
        if path.lower() in template_paths_lower:
            return True

        try:
            local_name = etree.QName(node_info.node).localname
        except (ValueError, TypeError):
            # Node without a tag name: none of the relaxed rules applies
            return False

        # Rule 1: 'affiliation' with any attributes is kept if the template
        # has a bare 'affiliation' under the same parent path.
        if local_name.lower() == 'affiliation':
            parent_path, separator, _ = path.rpartition(" | ")
            if separator and f"{parent_path} | {local_name}".lower() in template_paths_lower:
                return True

        # Rule 2: 'alternativeName' and everything below it is never deleted
        # merely for being absent from the template. If its parent is
        # removed, it disappears together with the parent.
        return 'alternativeName' in path

    def _delete_untemplated_nodes(self, input_root, template_nodes_info, log_data):
        """Remove input nodes whose path is not in the template and log lost values."""
        # Built once; a set makes each membership test O(1)
        template_paths_lower = {info.name.lower() for info in template_nodes_info}

        # The node list is a snapshot: it may contain nodes that sit inside
        # a subtree an earlier iteration already detached.
        for node_info in XmlHelper.get_all_nodes_info(input_root):
            if not node_info.name:
                continue  # e.g. the root, whose path is empty
            if self._is_kept(node_info, template_paths_lower):
                continue

            node = node_info.node
            parent = node.getparent()
            if parent is None:
                continue  # already removed from its parent

            # Only leaf values are worth reporting
            text = node.text
            if text and text.strip() and not node_info.has_child_elements:
                log_data["deletions"].append({
                    "path": node_info.name,
                    "label": node_info.name.split(" | ")[-1],
                    "value": text.strip()
                })

            parent.remove(node)
            self._remove_empty_parents(parent)

    def _sync_template_defaults(self, input_root, template_root, template_nodes_info, log_data):
        """Add template nodes that carry text but are missing from the input."""
        existing_paths = {info.name.lower() for info in XmlHelper.get_all_nodes_info(input_root)}

        for t_info in template_nodes_info:
            path_key = t_info.name.lower()
            if path_key in existing_paths:
                continue

            t_node = t_info.node
            default_text = t_node.text
            # Only nodes with an actual default value are synced
            if not (default_text and default_text.strip()):
                continue

            # Chain of template elements from just below the root to t_node
            chain = []
            curr = t_node
            while curr is not None and curr is not template_root:
                chain.append(curr)
                curr = curr.getparent()
            if not chain:
                continue
            chain.reverse()

            # Walk down the input tree, reusing the first child with the
            # same tag (attributes are ignored) and creating what is missing.
            current_parent = input_root
            for template_elem in chain:
                match = next(
                    (child for child in current_parent if child.tag == template_elem.tag),
                    None,
                )
                if match is None:
                    match = etree.SubElement(current_parent, template_elem.tag)
                    match.attrib.update(template_elem.attrib)
                current_parent = match

            current_parent.text = default_text

            label = t_info.name.split(" | ")[-1]
            log_data["additions"].append({
                "path": t_info.name,
                "label": label,
                "value": default_text,
                "summary": f"Set default {label} to '{default_text}'"
            })
            # Prevents a second addition for a template sibling with the same path
            existing_paths.add(path_key)

    def _align_date_blocks(self, input_root, template_root):
        """Place the reporting year in the same or a separate originInfo, as in the template.

        Propagates ``ValueError`` from ``XmlHelper.find_date_nodes``.
        """
        date_info_input = XmlHelper.find_date_nodes(input_root)
        date_info_template = XmlHelper.find_date_nodes(template_root)

        if date_info_input.both_dates_in_same_block == date_info_template.both_dates_in_same_block:
            return

        d_issued = date_info_input.date_issued_node
        d_reporting = date_info_input.reporting_year_node
        if d_issued is None or d_reporting is None:
            return  # nothing to rearrange

        if date_info_input.both_dates_in_same_block:
            # Separate: a new originInfo under the root receives the reporting
            # year. It is created in the root's namespace so that it is a
            # proper sibling of the existing elements.
            new_origin_info = etree.SubElement(
                input_root, self._qualified_tag(input_root, TAG_ORIGIN_INFO)
            )
            # Appending moves the element out of its old parent
            new_origin_info.append(d_reporting)
        else:
            # Unite: move the reporting year next to the issue date and drop
            # its former originInfo if that is now empty.
            former_host = d_reporting.getparent()
            d_issued.getparent().append(d_reporting)
            self._remove_empty_parents(former_host)

    @staticmethod
    def _snippet_infos(def_list):
        """Return node infos for detached config elements, wrapped in a dummy root."""
        holder = etree.Element("dummy")
        for element in def_list:
            # Copy: appending the original would move it out of the config
            holder.append(copy.deepcopy(element))
        return XmlHelper.get_all_nodes_info(holder)

    def _apply_move(self, root, source_def_list, dest_def_list, log_data):
        """Move the text of one element to another element, as one rule describes.

        *source_def_list* / *dest_def_list* are the children of the rule's
        ``element1`` / ``element2``. The last node info of each snippet is
        taken as its innermost element, which assumes a linear path. Nothing
        happens when the source element is absent or has no text. Missing
        destination elements are created below the deepest existing one.
        """
        if not source_def_list:
            return

        source_infos = self._snippet_infos(source_def_list)
        if not source_infos:
            return
        source_innermost = source_infos[-1]

        # First input node whose path equals the source path
        input_nodes_info = XmlHelper.get_all_nodes_info(root)
        target_node = next(
            (info.node for info in input_nodes_info if info.name == source_innermost.name),
            None,
        )
        if target_node is None or not target_node.text:
            return
        content = target_node.text

        dest_infos = self._snippet_infos(dest_def_list)
        if not dest_infos:
            return
        dest_innermost_name = dest_infos[-1].name

        # Path -> node; for duplicate paths the last node wins
        input_path_map = {info.name: info.node for info in input_nodes_info}

        # Walk the destination path top-down to the deepest existing node
        insertion_point = None
        first_missing = 0
        for index, info in enumerate(dest_infos):
            if info.name not in input_path_map:
                break  # this part is missing, so its children are too
            insertion_point = input_path_map[info.name]
            first_missing = index + 1

        # Start at the root when not even the top-level element exists
        current_parent = root if insertion_point is None else insertion_point

        # Create the missing remainder of the chain (none if the leaf exists)
        for info in dest_infos[first_missing:]:
            local_name = etree.QName(info.node).localname
            # New elements inherit the namespace of the element they go into
            new_elem = etree.SubElement(
                current_parent, self._qualified_tag(current_parent, local_name)
            )
            new_elem.attrib.update(info.node.attrib)
            current_parent = new_elem

        current_parent.text = content
        # The source keeps its element but loses the text
        target_node.text = ""

        dest_label = dest_innermost_name.split(" | ")[-1]
        log_data["moves"].append({
            "source": source_innermost.name,
            "dest": dest_innermost_name,
            "label": dest_label,
            "value": content,
            "summary": f"Moved {source_innermost.name.split(' | ')[-1]} content to {dest_label}"
        })

    def _remove_empty_parents(self, element):
        """Remove *element* if it has neither text nor children, then repeat for its parent."""
        while element is not None:
            has_text = element.text and element.text.strip()
            if has_text or len(element) > 0:
                return
            parent = element.getparent()
            if parent is None:
                return  # the root (or a detached node) is never removed
            parent.remove(element)
            element = parent
483
+
484
+ import os
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- altair
2
- pandas
3
- streamlit
 
1
+ streamlit
2
+ lxml
3
+ requests
utils.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from lxml import etree
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Dict, Tuple
5
+
6
+ # Namespace-free element names (and one attribute value) matched when scanning DORA MODS XML
7
+ TAG_GENRE = "genre"  # local name matched by XmlHelper.get_genre_node
8
+ TAG_MODS = "mods"  # local name of the MODS root element
9
+ TAG_ORIGIN_INFO = "originInfo"  # blocks scanned by XmlHelper.find_date_nodes
10
+ TAG_DATE_ISSUED = "dateIssued"  # child of originInfo holding the issue date
11
+ TAG_DATE_OTHER = "dateOther"  # child of originInfo that may carry the reporting year
12
+ ATTR_REPORTING_YEAR = "reporting year"  # compared against attribute *values* of dateOther, not attribute names
13
+
14
+ @dataclass
15
+ class DateInfo:  # result record returned by XmlHelper.find_date_nodes
16
+ both_dates_in_same_block: bool  # True when a single originInfo holds both a dateIssued and a reporting-year dateOther
17
+ date_issued_node: Optional[etree._Element]  # most recently seen dateIssued element, or None if none was found
18
+ reporting_year_node: Optional[etree._Element]  # most recently seen dateOther marked as reporting year, or None
19
+
20
+ @dataclass
21
+ class NodeInfo:  # one entry of the flat list built by XmlHelper.get_all_nodes_info
22
+ node: etree._Element  # the element this entry describes
23
+ # The parent is not stored here; lxml exposes it through node.getparent()
24
+ name: str # " | "-joined path of tag-plus-attribute labels from XmlHelper.get_node_path_name; "" for the mods root
25
+ has_child_elements: bool  # True when the element has at least one child node
26
+
27
class XmlHelper:
    """Stateless lxml helpers for parsing and inspecting a DORA MODS record.

    Lookups compare namespace-free local names, so the helpers work whether
    or not the document declares the MODS namespace. Comments and processing
    instructions are skipped everywhere: they have no qualified name, and
    passing them to ``etree.QName`` raises ``ValueError``.
    """

    @staticmethod
    def _is_element(node) -> bool:
        """Return True for real elements, False for comments, PIs and entities.

        lxml gives those special nodes a non-string ``tag``, which is what
        distinguishes them from ordinary elements.
        """
        return isinstance(node.tag, str)

    @staticmethod
    def parse_xml(file_path_or_content) -> etree._ElementTree:
        """Parse XML given as a file path, a text string, or raw bytes.

        Args:
            file_path_or_content: A ``str`` naming an existing file, a ``str``
                holding XML text, or ``bytes`` holding an XML document.

        Returns:
            The parsed element tree (blank-only text nodes are dropped).

        Raises:
            ValueError: If the input cannot be read or parsed. The original
                exception is chained as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            if isinstance(file_path_or_content, str) and os.path.exists(file_path_or_content):
                return etree.parse(file_path_or_content, parser)
            if isinstance(file_path_or_content, bytes):
                data = file_path_or_content
            else:
                # Text content is handed to the parser as UTF-8 bytes.
                data = file_path_or_content.encode('utf-8')
            return etree.fromstring(data, parser).getroottree()
        except Exception as e:
            # Keep the single ValueError contract callers rely on, but chain
            # the cause so the underlying failure stays visible in tracebacks.
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        The search includes *root* itself and ignores namespaces.
        """
        for elem in root.iter():
            if not XmlHelper._is_element(elem):
                continue
            if etree.QName(elem).localname == TAG_GENRE:
                return elem
        return None

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the ``dateIssued`` and reporting-year ``dateOther`` nodes.

        Every ``originInfo`` block is scanned in document order. Each block
        that contains one of the two nodes overrides the node remembered from
        earlier blocks. Scanning stops at the first block that contains both.

        Returns:
            A ``DateInfo`` with the remembered nodes and a flag telling
            whether both were found inside the same ``originInfo``.

        Raises:
            ValueError: If the document has no ``originInfo`` element.
        """
        origin_infos = [
            elem
            for elem in root.iter()
            if XmlHelper._is_element(elem)
            and etree.QName(elem).localname == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        both_in_same = False

        for origin_info in origin_infos:
            # Per-block results; the last matching child of a block wins.
            block_issued = None
            block_reporting = None

            for child in origin_info:
                if not XmlHelper._is_element(child):
                    continue
                localname = etree.QName(child).localname
                if localname == TAG_DATE_ISSUED:
                    block_issued = child
                elif localname == TAG_DATE_OTHER:
                    # The marker is an attribute *value*; the attribute name
                    # is deliberately not checked.
                    if ATTR_REPORTING_YEAR in child.attrib.values():
                        block_reporting = child

            # Compare with None explicitly: an lxml element without children
            # is falsy, so a plain truth test would be wrong here.
            if block_issued is not None:
                date_issued = block_issued
            if block_reporting is not None:
                reporting_year = block_reporting

            if block_issued is not None and block_reporting is not None:
                both_in_same = True
                break

        return DateInfo(both_in_same, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build the comparison name of *element*.

        The name is the local tag followed by one `` [key=value]`` group per
        attribute (sorted by key), prefixed with ``parent_path`` and `` | ``
        when a parent path is given. A ``mods`` element always yields the
        empty string, regardless of ``parent_path``.
        """
        tag = etree.QName(element).localname
        if tag == TAG_MODS:
            return ""

        parts = [tag]
        # Sorting makes the name independent of attribute order in the file.
        parts.extend(f"[{k}={v}]" for k, v in sorted(element.attrib.items()))
        name = " ".join(parts)

        if parent_path:
            return f"{parent_path} | {name}"
        return name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the tree below *root* into a list of ``NodeInfo`` entries.

        Entries are emitted in post-order: all descendants of an element come
        before the element itself, so *root* is the last entry. The ``mods``
        root is included with an empty name, which lets the roots of two
        documents compare equal by name.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            current_path = XmlHelper.get_node_path_name(element, parent_path)

            has_child_elements = False
            for child in element:
                # Comments/PIs are not structural nodes and have no name.
                if not XmlHelper._is_element(child):
                    continue
                has_child_elements = True
                traverse(child, current_path)

            nodes.append(NodeInfo(element, current_path, has_child_elements))

        # An empty parent path marks the top of the tree; children of the
        # mods root therefore get names without a leading separator.
        traverse(root, "")
        return nodes