File size: 24,041 Bytes
9336543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
import os
import copy
from lxml import etree
from typing import Dict, List, Tuple, Set
from utils import XmlHelper, DateInfo, NodeInfo, TAG_GENRE, TAG_ORIGIN_INFO, TAG_DATE_ISSUED, TAG_DATE_OTHER, ATTR_REPORTING_YEAR

class ModsConverter:
    def __init__(self):
        self.move_config = {}

    def load_config(self, config_path: str):
        """Loads configuration for moving content."""
        if not os.path.exists(config_path):
            return
        
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            tree = etree.parse(config_path, parser)
            root = tree.getroot()
            
            for conversion in root.findall("pubTypeConversion"):
                p1 = conversion.find("pubType1")
                p2 = conversion.find("pubType2")
                if p1 is None or p2 is None: continue
                
                pt1 = p1.text
                pt2 = p2.text
                
                moves = []
                for mc in conversion.findall("moveContent"):
                    e1 = mc.find("element1")
                    e2 = mc.find("element2")
                    if e1 is not None and e2 is not None:
                         # Store as list of elements to traverse matching path
                         moves.append((list(e1), list(e2)))
                
                if moves:
                    self.move_config[self._get_key(pt1, pt2)] = moves
                    # Reverse
                    self.move_config[self._get_key(pt2, pt1)] = [(m[1], m[0]) for m in moves]
                    
        except Exception as e:
            print(f"Error loading config: {e}")

    def _get_key(self, p1, p2):
        if not p1 or not p2: return ""
        return f"{p1.lower()}{p2.lower()}"

    def convert(self, input_xml_str: str, template_xml_path: str) -> Tuple[str, dict]:
        """Convert an input record to the publication type of a template.

        The conversion runs five steps on the parsed input tree:

        1. Replace the input genre's text and attributes with the template's.
        2. Apply the move rules registered for (old genre, new genre).
        3. Delete input nodes whose path does not occur in the template.
        4. Add template nodes that carry text but are missing from the input.
        5. Put the issue date and reporting year into the same or separate
           blocks, matching the arrangement found in the template.

        Args:
            input_xml_str: Input document, handed to ``XmlHelper.parse_xml``.
            template_xml_path: Template document, handed to
                ``XmlHelper.parse_xml`` as well (named as a path; confirm
                what that helper accepts).

        Returns:
            A tuple ``(xml, log)``. ``xml`` is the pretty-printed result, or
            ``""`` when parsing fails or a genre element is missing. ``log``
            is a dict with the keys ``old_genre``, ``new_genre``, ``moves``,
            ``additions``, ``deletions`` and ``warnings``.
        """
        log_data = {
            "old_genre": "",
            "new_genre": "",
            "moves": [],
            "additions": [],
            "deletions": [],
            "warnings": []
        }

        # Parse inputs
        try:
            input_root = XmlHelper.parse_xml(input_xml_str).getroot()
            template_root = XmlHelper.parse_xml(template_xml_path).getroot()
        except ValueError as e:
            log_data["warnings"].append(f"Error parsing XML: {e}")
            return "", log_data

        # 1. Exchange genre
        input_genre = XmlHelper.get_genre_node(input_root)
        template_genre = XmlHelper.get_genre_node(template_root)
        if input_genre is None or template_genre is None:
            log_data["warnings"].append("Missing genre element in input or template.")
            return "", log_data

        old_genre = input_genre.text
        new_genre = template_genre.text
        log_data["old_genre"] = old_genre
        log_data["new_genre"] = new_genre

        input_genre.text = new_genre
        input_genre.attrib.clear()
        input_genre.attrib.update(template_genre.attrib)

        # 2. Move content
        key = self._get_key(old_genre, new_genre)
        # An empty key means one of the genres has no text; never look that
        # up, so such a record cannot pick up unrelated rules.
        if key:
            for source_def, dest_def in self.move_config.get(key, ()):
                self._apply_move(input_root, source_def, dest_def, log_data)

        # 3. Delete content the template does not know
        template_nodes_info = XmlHelper.get_all_nodes_info(template_root)
        self._delete_extra_content(input_root, template_nodes_info, log_data)

        # 4. Add template defaults that are still missing
        self._sync_template_defaults(
            input_root, template_root, template_nodes_info, log_data
        )

        # 5. Align the date blocks with the template
        self._align_date_blocks(input_root, template_root, log_data)

        # Serialize
        return etree.tostring(input_root, encoding='unicode', pretty_print=True), log_data

    def _delete_extra_content(self, input_root, template_nodes_info, log_data):
        """Remove input nodes whose path is absent from the template."""
        # Paths are compared case-insensitively; build the lookup set once.
        template_paths_lower = {info.name.lower() for info in template_nodes_info}

        # The node list is a snapshot taken after the moves. Nodes removed
        # during the loop are still visited, so each one is re-checked for a
        # parent before it is removed.
        for node_info in XmlHelper.get_all_nodes_info(input_root):
            # Skip nodes without a path name.
            if not node_info.name:
                continue
            if node_info.name.lower() in template_paths_lower:
                continue
            if self._is_kept_by_relaxed_rules(node_info, template_paths_lower):
                continue

            node = node_info.node
            parent = node.getparent()
            if parent is None:
                # Already detached by an earlier removal.
                continue

            # Record only leaf nodes that actually lose text.
            text = node.text
            if text and text.strip() and not node_info.has_child_elements:
                log_data["deletions"].append({
                    "path": node_info.name,
                    "label": node_info.name.split(" | ")[-1],
                    "value": text.strip()
                })

            parent.remove(node)
            # Ancestors left without text and children are removed as well.
            self._remove_empty_parents(parent)

    def _is_kept_by_relaxed_rules(self, node_info, template_paths_lower):
        """Tell whether a node missing from the template must still be kept."""
        try:
            local_name = etree.QName(node_info.node).localname
        except (ValueError, TypeError, AttributeError):
            # No usable tag name: no relaxed rule applies.
            return False

        # Rule 1: an 'affiliation' with any attributes is kept when the
        # template holds the bare tag under the same parent path, e.g.
        # "parent | affiliation [type=group]" matches "parent | affiliation".
        if local_name.lower() == 'affiliation':
            parent_part, separator, _ = node_info.name.rpartition(" | ")
            if separator:
                relaxed_candidate = f"{parent_part} | {local_name}"
                if relaxed_candidate.lower() in template_paths_lower:
                    return True

        # Rule 2: 'alternativeName' and everything below it is never deleted
        # merely for being absent from the template. If an ancestor is
        # removed, the subtree still leaves the document with it.
        return 'alternativeName' in node_info.name

    def _sync_template_defaults(self, input_root, template_root, template_nodes_info, log_data):
        """Add template nodes that carry text but are missing from the input."""
        input_paths_final = {
            info.name.lower() for info in XmlHelper.get_all_nodes_info(input_root)
        }

        for t_info in template_nodes_info:
            path_key = t_info.name.lower()
            if path_key in input_paths_final:
                continue

            t_node = t_info.node
            # Only nodes with actual text count as a default value.
            if not (t_node.text and t_node.text.strip()):
                continue

            # Collect (tag, attributes) from the node up to, but excluding,
            # the template root, then order the chain top-down.
            path_elements = []
            curr = t_node
            while curr is not None and curr is not template_root:
                path_elements.append((curr.tag, curr.attrib))
                curr = curr.getparent()
            if not path_elements:
                continue
            path_elements.reverse()

            # Walk down the input tree, reusing the first child with the same
            # tag (attributes are deliberately ignored) and creating whatever
            # is missing.
            current_parent = input_root
            for tag, attrib in path_elements:
                match = None
                for child in current_parent:
                    if child.tag == tag:
                        match = child
                        break
                if match is None:
                    match = etree.Element(tag)
                    match.attrib.update(attrib)
                    current_parent.append(match)
                current_parent = match

            current_parent.text = t_node.text

            label = t_info.name.split(" | ")[-1]
            log_data["additions"].append({
                "path": t_info.name,
                "label": label,
                "value": t_node.text,
                "summary": f"Set default {label} to '{t_node.text}'"
            })
            # Remember the path so a second template node with the same path
            # is not added again.
            input_paths_final.add(path_key)

    def _align_date_blocks(self, input_root, template_root, log_data):
        """Split or merge the date blocks so they are arranged as in the template."""
        try:
            date_info_input = XmlHelper.find_date_nodes(input_root)
            date_info_template = XmlHelper.find_date_nodes(template_root)

            if date_info_input.both_dates_in_same_block == date_info_template.both_dates_in_same_block:
                return

            d_issued = date_info_input.date_issued_node
            d_reporting = date_info_input.reporting_year_node
            if d_issued is None or d_reporting is None:
                # Nothing to rearrange when either date is missing.
                return

            if date_info_input.both_dates_in_same_block:
                # Same block -> separate: give the reporting year a new
                # origin-info element appended to the root. Appending an
                # element that already has a parent moves it.
                new_origin_info = etree.Element(TAG_ORIGIN_INFO)
                input_root.append(new_origin_info)
                new_origin_info.append(d_reporting)
            else:
                # Separate -> same block: move the reporting year next to
                # the issue date and drop its former parent if now empty.
                target_origin_info = d_issued.getparent()
                old_host_origin_info = d_reporting.getparent()
                target_origin_info.append(d_reporting)
                self._remove_empty_parents(old_host_origin_info)

        except ValueError as e:
            log_data["warnings"].append(f"Date processing warning: {e}")

    def _apply_move(self, root, source_def_list, dest_def_list, log_data):
        # source_def_list is list of Elements defining the structure to find content
        # We need to find the innermost element in source path in 'root'
        
        # 1. Construct path string match logic is hard with just Elements.
        # But we can find the node in 'root' that matches the path described by 'source_def_list'
        # Java `createNodeInfo` uses path names.
        
        # Effectively: find a node in root that has same path structure as source_def_list.
        # The 'source_def_list' comes from config xml <element1><child>...</child></element1>
        
        # Helper to get path name for the def list
        # It seems def list is just a chain of elements?
        # <element1><relatedItem type="host"><titleInfo><title/></titleInfo></relatedItem></element1>
        # The list from findall("moveContent") -> element1 children.
        # If element1 has one child `relatedItem`, and that has child `titleInfo`...
        # We need to reconstruct the "NodeInfo.name" style string for this chain.
        # source_def_list is list of children of <element1>. Usually just 1 top child.
        
        if not source_def_list: return
        
        # Helper to simulate NodeInfo generation for the config snippet
        def get_snippet_path_name(elements):
             # Deep traverse the first element until leaf
             # Java logic: `nodeInfosSource = ModsXmlHelper.createNodeInfo(null, moveContent.sourceNodeList);`
             # `innermostNode = nodeInfosSource.get(nodeInfosSource.size() - 1);`
             pass
        
        # Let's trust Java's logic: it matches based on `NodeInfo.name`.
        # So we generate NodeInfo for config snippet.
        # But config snippet is "detached" elements.
        
        # We need a root for the snippet to pass to XmlHelper?
        # We can wrap source_def_list in a dummy root?
        dummy = etree.Element("dummy")
        for e in source_def_list:
             # We need to deep copy because append moves it
             dummy.append(copy.deepcopy(e))
             
        # But wait, `get_node_path_name` relies on parents.
        # If we dump it in dummy, parent is dummy.
        # We need path starting from valid MODS path?
        # The config usually contains FULL path inside <mods> (implicit?).
        # Java: `moveContent` uses `modsXmlHelper` which excludes `mods` tag from path.
        # Example config: `<relatedItem type="host"><titleInfo><title>`
        # This matches `mods/relatedItem/titleInfo/title`.
        # So passing children of <element1> to dummy, calling get_all_nodes_info
        # will give us paths like "relatedItem ... | titleInfo ... | title".
        # We need the leaf one.
        
        source_infos = XmlHelper.get_all_nodes_info(dummy)
        if not source_infos: return
        source_innermost = source_infos[-1]
        
        # Now find this path in `root`
        input_nodes_info = XmlHelper.get_all_nodes_info(root)
        
        target_node = None
        for info in input_nodes_info:
            if info.name == source_innermost.name:
                target_node = info.node
                break
        
        if target_node is None or not target_node.text:
             return 

        content = target_node.text
        
        # Now find destination
        dummy_dest = etree.Element("dummy")
        for e in dest_def_list:
             dummy_dest.append(copy.deepcopy(e))
             
        dest_infos = XmlHelper.get_all_nodes_info(dummy_dest)
        if not dest_infos: return
        dest_innermost_name = dest_infos[-1].name
        
        # We need to insert this content at dest_innermost_name
        # Java `insertElement` logic:
        # Traverse destination path backwards. Find first part that exists in document.
        # Insert remainder.
        
        # We have dest_infos list which represents the FULL path chain.
        # Check from end: if `info.name` exists in root?
        
        # Input nodes map for fast lookup
        input_path_map = {n.name: n.node for n in input_nodes_info}
        
        insertion_point_node = None
        remainder_start_index = 0
        
        # dest_infos is ordered top-down (root to leaf).
        # We want to find the DEEPEST existing node.
        
        for i, info in enumerate(dest_infos):
             if info.name in input_path_map:
                  insertion_point_node = input_path_map[info.name]
                  remainder_start_index = i + 1
             else:
                  # This part doesn't exist, and subsequently children won't either
                  break
        
        parent = insertion_point_node
        if parent is None:
             parent = root # Start at root if nothing matches (top level element missing)
        
        # Construct remainder
        current_parent = parent
        
        # The elements in dest_infos are from dummy tree. We need to create NEW elements in input tree.
        # We effectively clone the structure from `dest_infos[remainder_start_index:]`.
        
        # But wait, `dest_infos` is flat list.
        # We need hierarchy.
        
        # If remainder is empty, it means leaf already exists. We update text.
        if remainder_start_index >= len(dest_infos):
             current_parent.text = content
        else:
             # We need to build the missing chain.
             # The `dest_infos` list contains NodeInfos. We can look at `info.node` to get tag/attribs.
             
             # The structure of `dest_infos` for `<A><B><C>` is [`A`, `A|B`, `A|B|C`]. (if traversing depth first)
             # We can't easily jump from `A` to `B` just by list index if there are siblings.
             # But here config is usually linear path.
             
             for i in range(remainder_start_index, len(dest_infos)):
                  info = dest_infos[i]
                  # Create element
                  # info.node is the element in dummy tree
                  new_elem = etree.Element(etree.QName(info.node).localname)
                  # Copy attribs
                  new_elem.attrib.update(info.node.attrib)
                  
                  current_parent.append(new_elem)
                  current_parent = new_elem
             
             # Set text on the last one
             current_parent.text = content

        # Clear text from source
        target_node.text = ""
        
        # Log structured move
        log_data["moves"].append({
            "source": source_innermost.name,
            "dest": dest_innermost_name,
            "label": dest_innermost_name.split(" | ")[-1],
            "value": content,
            "summary": f"Moved {source_innermost.name.split(' | ')[-1]} content to {dest_innermost_name.split(' | ')[-1]}"
        })


    def _remove_empty_parents(self, element):
         if element is None: return
         
         # Check if empty: no text (strip), no children
         has_text = element.text and element.text.strip()
         if not has_text and len(element) == 0:
              parent = element.getparent()
              if parent is not None:
                   parent.remove(element)
                   self._remove_empty_parents(parent)

import os