Spaces:

andrehoffmann80
/

dora_pubtype

Sleeping

File size: 24,041 Bytes
import os
import copy
from lxml import etree
from typing import Dict, List, Tuple, Set
from utils import XmlHelper, DateInfo, NodeInfo, TAG_GENRE, TAG_ORIGIN_INFO, TAG_DATE_ISSUED, TAG_DATE_OTHER, ATTR_REPORTING_YEAR

class ModsConverter:
    """Convert an XML record from one publication type (genre) to another.

    The conversion is driven by a template document for the target genre and
    by an optional configuration file describing which element contents have
    to be moved when switching between two genres.
    """

    def __init__(self):
        # Maps a genre-pair key (see ``_get_key``) to a list of
        # ``(source_elements, destination_elements)`` move definitions.
        self.move_config: Dict[str, list] = {}

    def load_config(self, config_path: str):
        """Load the content-move rules from an XML configuration file.

        Every ``pubTypeConversion`` element directly below the root contributes
        the ``moveContent`` rules for its ``pubType1``/``pubType2`` pair. Each
        rule is registered in both directions.

        Loading is best-effort: a missing file is silently ignored and any
        error raised while reading the file is printed, not propagated.

        Args:
            config_path: Path of the configuration file.
        """
        if not os.path.exists(config_path):
            return

        parser = etree.XMLParser(remove_blank_text=True)
        try:
            root = etree.parse(config_path, parser).getroot()

            for conversion in root.findall("pubTypeConversion"):
                p1 = conversion.find("pubType1")
                p2 = conversion.find("pubType2")
                if p1 is None or p2 is None:
                    continue

                forward_key = self._get_key(p1.text, p2.text)
                if not forward_key:
                    # A publication type without text would be stored under
                    # the empty key and later match every record whose genre
                    # is empty; ignore such entries.
                    continue
                reverse_key = self._get_key(p2.text, p1.text)

                moves = []
                for mc in conversion.findall("moveContent"):
                    e1 = mc.find("element1")
                    e2 = mc.find("element2")
                    if e1 is not None and e2 is not None:
                        # Keep the child elements; they describe the path
                        # of the content to move.
                        moves.append((list(e1), list(e2)))

                if moves:
                    self.move_config[forward_key] = moves
                    # The same rules apply in the opposite direction with
                    # source and destination swapped.
                    self.move_config[reverse_key] = [(dst, src) for src, dst in moves]

        except Exception as e:
            print(f"Error loading config: {e}")

    def _get_key(self, p1, p2):
        """Return the lookup key for a genre pair, or ``""`` if one is missing.

        The key is the two lower-cased names concatenated in the given order.
        """
        if not p1 or not p2:
            return ""
        return f"{p1.lower()}{p2.lower()}"

    def convert(self, input_xml_str: str, template_xml_path: str) -> Tuple[str, dict]:
        """Convert the input XML so that it matches the template's genre.

        Steps, in order: replace the genre, apply the configured content
        moves, delete nodes that the template does not contain, add template
        nodes carrying default text, and align the placement of the date
        nodes with the template.

        Args:
            input_xml_str: Input document, handed to ``XmlHelper.parse_xml``.
            template_xml_path: Template document, handed to
                ``XmlHelper.parse_xml`` as well.

        Returns:
            A tuple ``(xml, log)``. ``xml`` is the pretty-printed result, or
            ``""`` when parsing raised ``ValueError`` or a genre element is
            missing. ``log`` is a dict with the keys ``old_genre``,
            ``new_genre``, ``moves``, ``additions``, ``deletions`` and
            ``warnings``.
        """
        log_data = {
            "old_genre": "",
            "new_genre": "",
            "moves": [],
            "additions": [],
            "deletions": [],
            "warnings": []
        }

        try:
            input_root = XmlHelper.parse_xml(input_xml_str).getroot()
            template_root = XmlHelper.parse_xml(template_xml_path).getroot()
        except ValueError as e:
            log_data["warnings"].append(f"Error parsing XML: {e}")
            return "", log_data

        # 1. Exchange genre (text and attributes).
        input_genre = XmlHelper.get_genre_node(input_root)
        template_genre = XmlHelper.get_genre_node(template_root)

        if input_genre is None or template_genre is None:
            log_data["warnings"].append("Missing genre element in input or template.")
            return "", log_data

        old_genre = input_genre.text
        new_genre = template_genre.text
        log_data["old_genre"] = old_genre
        log_data["new_genre"] = new_genre

        input_genre.text = new_genre
        input_genre.attrib.clear()
        input_genre.attrib.update(template_genre.attrib)

        # 2. Move content. An empty key means a genre text is missing; it
        #    must never select a rule set.
        key = self._get_key(old_genre, new_genre)
        if key:
            for source_def, dest_def in self.move_config.get(key, []):
                self._apply_move(input_root, source_def, dest_def, log_data)

        # 3. Delete content the template does not know. Paths are compared
        #    case-insensitively; the set is built once for the whole pass.
        template_nodes_info = XmlHelper.get_all_nodes_info(template_root)
        template_paths_lower = {n.name.lower() for n in template_nodes_info}
        self._delete_extra_content(input_root, template_paths_lower, log_data)

        # 4. Add template nodes that carry default text.
        self._sync_template_defaults(input_root, template_root, template_nodes_info, log_data)

        # 5. Align the date nodes with the template layout.
        try:
            self._align_date_blocks(input_root, template_root)
        except ValueError as e:
            log_data["warnings"].append(f"Date processing warning: {e}")

        return etree.tostring(input_root, encoding='unicode', pretty_print=True), log_data

    def _delete_extra_content(self, input_root, template_paths_lower, log_data):
        """Remove every input node whose path is unknown to the template.

        Parents left empty by a removal are removed as well. Removed leaf
        nodes with text are recorded in ``log_data["deletions"]``.
        """
        # The node list is a snapshot taken before any removal, so a node may
        # already be detached when its turn comes.
        for node_info in XmlHelper.get_all_nodes_info(input_root):
            if not node_info.name:
                continue
            if node_info.name.lower() in template_paths_lower:
                continue
            if self._is_kept_by_loose_rule(node_info, template_paths_lower):
                continue

            node = node_info.node
            parent = node.getparent()
            if parent is None:
                # Root node, or already removed in an earlier iteration.
                continue

            text = node.text
            if text and text.strip() and not node_info.has_child_elements:
                log_data["deletions"].append({
                    "path": node_info.name,
                    "label": node_info.name.split(" | ")[-1],
                    "value": text.strip()
                })

            parent.remove(node)
            self._remove_empty_parents(parent)

    @staticmethod
    def _is_kept_by_loose_rule(node_info, template_paths_lower):
        """Tell whether a node missing from the template is kept anyway.

        Two relaxed rules apply:

        * an ``affiliation`` node is kept when the template contains its
          parent path followed by a bare ``affiliation`` segment;
        * any node whose path contains ``alternativeName`` is kept.
        """
        try:
            local_name = etree.QName(node_info.node).localname
        except (ValueError, TypeError):
            # No regular tag name (QName rejects it): no rule applies.
            return False

        if local_name.lower() == 'affiliation':
            last_sep = node_info.name.rfind(" | ")
            if last_sep != -1:
                # Replace the last path segment by the bare tag name.
                relaxed_candidate = f"{node_info.name[:last_sep]} | {local_name}"
                if relaxed_candidate.lower() in template_paths_lower:
                    return True

        return 'alternativeName' in node_info.name

    def _sync_template_defaults(self, input_root, template_root, template_nodes_info, log_data):
        """Add template nodes with text that are missing from the input.

        Missing ancestors are created on the way; existing ones are matched
        by tag only. Each addition is recorded in ``log_data["additions"]``.
        """
        input_paths_final = {
            n.name.lower() for n in XmlHelper.get_all_nodes_info(input_root)
        }

        for t_info in template_nodes_info:
            if t_info.name.lower() in input_paths_final:
                continue

            t_node = t_info.node
            # Only nodes carrying an actual default value are synced.
            if not (t_node.text and t_node.text.strip()):
                continue

            # Chain of (tag, attributes) from below the template root down to
            # the node itself.
            path_elements = []
            curr = t_node
            while curr is not None and curr != template_root:
                path_elements.append((curr.tag, curr.attrib))
                curr = curr.getparent()
            if not path_elements:
                continue
            path_elements.reverse()

            current_parent = input_root
            for tag, attrib in path_elements:
                match = next((child for child in current_parent if child.tag == tag), None)
                if match is None:
                    match = etree.Element(tag)
                    match.attrib.update(attrib)
                    current_parent.append(match)
                current_parent = match

            current_parent.text = t_node.text

            label = t_info.name.split(" | ")[-1]
            log_data["additions"].append({
                "path": t_info.name,
                "label": label,
                "value": t_node.text,
                "summary": f"Set default {label} to '{t_node.text}'"
            })
            # Remember the path so that an identical template sibling is not
            # added a second time.
            input_paths_final.add(t_info.name.lower())

    def _align_date_blocks(self, input_root, template_root):
        """Place the reporting-year node like the template does.

        Nothing happens when input and template already agree on whether both
        date nodes share one parent, or when the input lacks either node.
        """
        date_info_input = XmlHelper.find_date_nodes(input_root)
        date_info_template = XmlHelper.find_date_nodes(template_root)

        if date_info_input.both_dates_in_same_block == date_info_template.both_dates_in_same_block:
            return

        d_issued = date_info_input.date_issued_node
        d_reporting = date_info_input.reporting_year_node
        if d_issued is None or d_reporting is None:
            return

        if date_info_input.both_dates_in_same_block:
            # Separate: the reporting year gets a new origin-info element
            # appended to the root. Appending moves the node out of its old
            # parent.
            new_origin_info = etree.Element(TAG_ORIGIN_INFO)
            input_root.append(new_origin_info)
            new_origin_info.append(d_reporting)
        else:
            # Unite: move the reporting year next to the issue date and drop
            # its former parent if that is empty now.
            target_origin_info = d_issued.getparent()
            old_host_origin_info = d_reporting.getparent()
            target_origin_info.append(d_reporting)
            self._remove_empty_parents(old_host_origin_info)

    @staticmethod
    def _snippet_nodes_info(elements):
        """Return the node infos of a config snippet.

        The elements are deep-copied below a throw-away root because
        appending an element to a new parent moves it out of its old one.
        """
        dummy = etree.Element("dummy")
        for element in elements:
            dummy.append(copy.deepcopy(element))
        return XmlHelper.get_all_nodes_info(dummy)

    def _apply_move(self, root, source_def_list, dest_def_list, log_data):
        """Move the text of one configured source node to its destination.

        The last node info of each snippet identifies the source and the
        destination path. Missing destination elements are created below the
        deepest destination ancestor that already exists in ``root``. The
        source element itself stays in place with empty text.

        Args:
            root: Root element of the document being converted.
            source_def_list: Config elements describing the source path.
            dest_def_list: Config elements describing the destination path.
            log_data: Log dict; a record is appended to ``log_data["moves"]``.
        """
        if not source_def_list:
            return

        source_infos = self._snippet_nodes_info(source_def_list)
        if not source_infos:
            return
        source_innermost = source_infos[-1]

        # First node of the document with the same path name as the source.
        input_nodes_info = XmlHelper.get_all_nodes_info(root)
        target_node = next(
            (info.node for info in input_nodes_info if info.name == source_innermost.name),
            None,
        )
        if target_node is None or not target_node.text:
            return
        content = target_node.text

        dest_infos = self._snippet_nodes_info(dest_def_list)
        if not dest_infos:
            return
        dest_innermost_name = dest_infos[-1].name

        input_path_map = {n.name: n.node for n in input_nodes_info}

        # dest_infos is ordered from the outermost to the innermost element;
        # walk down while the path still exists in the document.
        insertion_point_node = None
        remainder_start_index = 0
        for i, info in enumerate(dest_infos):
            if info.name not in input_path_map:
                break
            insertion_point_node = input_path_map[info.name]
            remainder_start_index = i + 1

        current_parent = root if insertion_point_node is None else insertion_point_node

        # Create the missing part of the chain (nothing to create when the
        # destination leaf already exists). The snippet is assumed to be a
        # linear path, as in the original implementation.
        for info in dest_infos[remainder_start_index:]:
            new_elem = etree.Element(etree.QName(info.node).localname)
            new_elem.attrib.update(info.node.attrib)
            current_parent.append(new_elem)
            current_parent = new_elem
        current_parent.text = content

        target_node.text = ""

        source_label = source_innermost.name.split(" | ")[-1]
        dest_label = dest_innermost_name.split(" | ")[-1]
        log_data["moves"].append({
            "source": source_innermost.name,
            "dest": dest_innermost_name,
            "label": dest_label,
            "value": content,
            "summary": f"Moved {source_label} content to {dest_label}"
        })

    def _remove_empty_parents(self, element):
        """Remove ``element`` and its ancestors for as long as they are empty.

        An element is empty when it has no child elements and no text other
        than whitespace. An element without a parent is never removed.
        """
        current = element
        while current is not None:
            has_text = current.text and current.text.strip()
            if has_text or len(current) != 0:
                return
            parent = current.getparent()
            if parent is None:
                return
            parent.remove(current)
            current = parent

import os