Spaces:
Sleeping
Sleeping
| import os | |
| import copy | |
| from lxml import etree | |
| from typing import Dict, List, Tuple, Set | |
| from utils import XmlHelper, DateInfo, NodeInfo, TAG_GENRE, TAG_ORIGIN_INFO, TAG_DATE_ISSUED, TAG_DATE_OTHER, ATTR_REPORTING_YEAR | |
class ModsConverter:
    """Convert an XML record from one genre to another, guided by a template.

    The conversion (see ``convert``) swaps the genre element, optionally
    moves content between elements according to a loaded configuration,
    deletes input nodes whose path does not occur in the template, copies
    template default texts that the input lacks, and finally aligns the
    grouping of the date-issued / reporting-year nodes with the template.

    Node "paths" are the ``NodeInfo.name`` strings produced by
    ``XmlHelper.get_all_nodes_info``; segments are separated by ``" | "``.
    """

    def __init__(self) -> None:
        # Maps a genre-pair key (see ``_get_key``) to a list of
        # ``(source_elements, dest_elements)`` tuples.
        self.move_config = {}

    def load_config(self, config_path: str):
        """Load the move-content configuration from ``config_path``.

        For every ``pubTypeConversion`` element that has both ``pubType1``
        and ``pubType2`` children, the ``moveContent`` entries are stored in
        ``self.move_config`` for the forward direction and, with source and
        destination swapped, for the reverse direction.

        A missing file is silently ignored. Any error while parsing or
        reading the configuration is printed and otherwise ignored
        (best-effort loading); entries stored before the error are kept.
        """
        if not os.path.exists(config_path):
            return

        parser = etree.XMLParser(remove_blank_text=True)
        try:
            root = etree.parse(config_path, parser).getroot()
            for conversion in root.findall("pubTypeConversion"):
                p1 = conversion.find("pubType1")
                p2 = conversion.find("pubType2")
                if p1 is None or p2 is None:
                    continue

                forward_key = self._get_key(p1.text, p2.text)
                if not forward_key:
                    # One publication type has no text. Storing the entry
                    # under "" would make it match every conversion whose
                    # genre text is empty, so skip it. (The reverse key is
                    # empty under exactly the same condition.)
                    continue
                reverse_key = self._get_key(p2.text, p1.text)

                moves = []
                for move_content in conversion.findall("moveContent"):
                    e1 = move_content.find("element1")
                    e2 = move_content.find("element2")
                    if e1 is not None and e2 is not None:
                        # Keep the child elements; they describe the path
                        # to the node whose content is moved.
                        moves.append((list(e1), list(e2)))

                if moves:
                    self.move_config[forward_key] = moves
                    self.move_config[reverse_key] = [
                        (dest, source) for source, dest in moves
                    ]
        except Exception as e:
            print(f"Error loading config: {e}")

    def _get_key(self, p1, p2):
        """Return the lookup key for a genre pair, or ``""`` if either is empty.

        The key is the two lower-cased names concatenated without a
        separator.
        """
        if not p1 or not p2:
            return ""
        return f"{p1.lower()}{p2.lower()}"

    def convert(self, input_xml_str: str, template_xml_path: str) -> Tuple[str, dict]:
        """Convert the input XML based on the template.

        Both arguments are handed to ``XmlHelper.parse_xml``.

        Returns:
            A tuple ``(xml, log)``. ``xml`` is the pretty-printed result, or
            ``""`` when parsing fails or a genre element is missing. ``log``
            is a dict with the keys ``old_genre``, ``new_genre``, ``moves``,
            ``additions``, ``deletions`` and ``warnings``.
        """
        log_data = {
            "old_genre": "",
            "new_genre": "",
            "moves": [],
            "additions": [],
            "deletions": [],
            "warnings": []
        }

        # Parse inputs
        try:
            input_root = XmlHelper.parse_xml(input_xml_str).getroot()
            template_root = XmlHelper.parse_xml(template_xml_path).getroot()
        except ValueError as e:
            log_data["warnings"].append(f"Error parsing XML: {e}")
            return "", log_data

        # 1. Exchange genre
        input_genre = XmlHelper.get_genre_node(input_root)
        template_genre = XmlHelper.get_genre_node(template_root)
        if input_genre is None or template_genre is None:
            log_data["warnings"].append("Missing genre element in input or template.")
            return "", log_data

        old_genre = input_genre.text
        new_genre = template_genre.text
        log_data["old_genre"] = old_genre
        log_data["new_genre"] = new_genre

        input_genre.text = new_genre
        input_genre.attrib.clear()
        input_genre.attrib.update(template_genre.attrib)

        # 2. Move content. An empty key means one genre had no text; such a
        # pair must never select a configuration entry.
        key = self._get_key(old_genre, new_genre)
        if key:
            for source_def, dest_def in self.move_config.get(key, []):
                self._apply_move(input_root, source_def, dest_def, log_data)

        # 3. Delete input nodes that the template does not contain
        template_nodes_info = XmlHelper.get_all_nodes_info(template_root)
        self._delete_extra_content(input_root, template_nodes_info, log_data)

        # 4. Add template defaults that are missing in the input
        self._sync_template_defaults(
            input_root, template_root, template_nodes_info, log_data
        )

        # 5. Align date grouping with the template
        self._handle_dates(input_root, template_root, log_data)

        # Serialize
        return etree.tostring(input_root, encoding='unicode', pretty_print=True), log_data

    def _delete_extra_content(self, input_root, template_nodes_info, log_data):
        """Remove input nodes whose path is not present in the template.

        Paths are compared case-insensitively. Removed leaf nodes with
        non-blank text are recorded in ``log_data["deletions"]``. After each
        removal, parents that became empty are removed as well.
        """
        # Built once: the set of template paths does not change in the loop.
        template_paths_lower = {info.name.lower() for info in template_nodes_info}

        # The node list is a snapshot taken before any removal.
        for node_info in XmlHelper.get_all_nodes_info(input_root):
            # Skip empty names (e.g. a root that resolved to an empty path)
            if not node_info.name:
                continue
            if node_info.name.lower() in template_paths_lower:
                continue
            if self._is_loosely_preserved(node_info, template_paths_lower):
                continue

            node = node_info.node
            parent = node.getparent()
            if parent is None:
                # Already detached by an earlier removal in this loop.
                continue

            text = node.text
            if text and text.strip() and not node_info.has_child_elements:
                log_data["deletions"].append({
                    "path": node_info.name,
                    "label": node_info.name.split(" | ")[-1],
                    "value": text.strip()
                })

            parent.remove(node)
            self._remove_empty_parents(parent)

    def _is_loosely_preserved(self, node_info, template_paths_lower):
        """Return True if a node missing from the template must still be kept.

        Rule 1: an ``affiliation`` element is kept when the template holds
        the same path with a bare ``affiliation`` as last segment.
        Rule 2: any node whose path contains ``alternativeName`` is kept; it
        still disappears with its parent if that parent gets deleted.
        """
        try:
            local_name = etree.QName(node_info.node).localname
        except (ValueError, TypeError):
            # No usable tag name (lxml raises ValueError e.g. for non-string
            # tags); no loose rule applies.
            return False

        if local_name.lower() == 'affiliation':
            # "parent | affiliation [type=group]" -> "parent | affiliation"
            last_sep = node_info.name.rfind(" | ")
            if last_sep != -1:
                relaxed_candidate = f"{node_info.name[:last_sep]} | {local_name}"
                if relaxed_candidate.lower() in template_paths_lower:
                    return True

        return 'alternativeName' in node_info.name

    def _sync_template_defaults(self, input_root, template_root,
                                template_nodes_info, log_data):
        """Copy template nodes with non-blank text that the input lacks.

        For each such template node the chain of elements from below the
        template root down to the node is walked in the input; existing
        children are reused (matched by tag only), missing ones are created
        with the template's tag and attributes. The text of the final
        element is then set to the template text and the addition is logged
        in ``log_data["additions"]``.
        """
        input_paths = {
            info.name.lower() for info in XmlHelper.get_all_nodes_info(input_root)
        }

        for t_info in template_nodes_info:
            path_key = t_info.name.lower()
            if path_key in input_paths:
                continue

            t_node = t_info.node
            default_text = t_node.text
            # Only nodes carrying an actual default value are synced
            if not (default_text and default_text.strip()):
                continue

            # Chain of (tag, attrib) from just below the template root
            # down to t_node.
            chain = []
            curr = t_node
            while curr is not None and curr is not template_root:
                chain.append((curr.tag, curr.attrib))
                curr = curr.getparent()
            chain.reverse()
            if not chain:
                continue

            current_parent = input_root
            for tag, attrib in chain:
                # Loose match for sync purposes: first child with same tag
                match = next(
                    (child for child in current_parent if child.tag == tag),
                    None,
                )
                if match is None:
                    match = etree.Element(tag)
                    match.attrib.update(attrib)
                    current_parent.append(match)
                current_parent = match

            current_parent.text = default_text

            label = t_info.name.split(" | ")[-1]
            log_data["additions"].append({
                "path": t_info.name,
                "label": label,
                "value": default_text,
                "summary": f"Set default {label} to '{default_text}'"
            })
            # Remember the path so an identical template path is not
            # added a second time.
            input_paths.add(path_key)

    def _handle_dates(self, input_root, template_root, log_data):
        """Make the input group its two date nodes the way the template does.

        If input and template disagree on ``both_dates_in_same_block``, the
        reporting-year node is either moved into a new ``TAG_ORIGIN_INFO``
        element appended to the input root (split), or moved next to the
        date-issued node with its old, now possibly empty, parent cleaned up
        (unite). A ``ValueError`` raised on the way is logged as a warning.
        """
        try:
            date_info_input = XmlHelper.find_date_nodes(input_root)
            date_info_template = XmlHelper.find_date_nodes(template_root)

            if (date_info_input.both_dates_in_same_block
                    == date_info_template.both_dates_in_same_block):
                return

            d_issued = date_info_input.date_issued_node
            d_reporting = date_info_input.reporting_year_node
            if d_issued is None or d_reporting is None:
                # Nothing to rearrange if either node is missing
                return

            if date_info_input.both_dates_in_same_block:
                # Currently same block -> separate them
                new_origin_info = etree.Element(TAG_ORIGIN_INFO)
                input_root.append(new_origin_info)
                # Appending to a new parent detaches from the old one
                new_origin_info.append(d_reporting)
            else:
                # Currently separate -> unite them
                target_origin_info = d_issued.getparent()
                old_host_origin_info = d_reporting.getparent()
                target_origin_info.append(d_reporting)
                self._remove_empty_parents(old_host_origin_info)
        except ValueError as e:
            log_data["warnings"].append(f"Date processing warning: {e}")

    def _snippet_nodes_info(self, def_list):
        """Return the node infos of the config elements in ``def_list``.

        The elements are deep-copied under a throwaway root (appending the
        originals would detach them from the config tree) and passed to
        ``XmlHelper.get_all_nodes_info``.
        """
        dummy = etree.Element("dummy")
        for element in def_list:
            dummy.append(copy.deepcopy(element))
        return XmlHelper.get_all_nodes_info(dummy)

    def _apply_move(self, root, source_def_list, dest_def_list, log_data):
        """Move the text of one node in ``root`` to another path.

        ``source_def_list`` / ``dest_def_list`` are lists of config elements
        (children of ``element1`` / ``element2``). The last node info of
        each describes the source and the destination path. The first input
        node whose path equals the source path provides the text; if there
        is none, or it has no text, nothing happens. Missing destination
        elements are created below the deepest destination ancestor that
        already exists (or below ``root``). The source text is then cleared
        and the move is appended to ``log_data["moves"]``.
        """
        if not source_def_list:
            return

        source_infos = self._snippet_nodes_info(source_def_list)
        if not source_infos:
            return
        source_innermost = source_infos[-1]

        # Locate the source node in the document (first match wins)
        input_nodes_info = XmlHelper.get_all_nodes_info(root)
        target_node = next(
            (info.node for info in input_nodes_info
             if info.name == source_innermost.name),
            None,
        )
        if target_node is None or not target_node.text:
            return
        content = target_node.text

        dest_infos = self._snippet_nodes_info(dest_def_list)
        if not dest_infos:
            return
        dest_innermost_name = dest_infos[-1].name

        # NOTE(review): for duplicate paths this map keeps the LAST node,
        # while the source lookup above takes the FIRST - confirm intended.
        input_path_map = {info.name: info.node for info in input_nodes_info}

        # Walk the destination chain top-down to the deepest existing node.
        current_parent = root
        remainder_start_index = 0
        for i, info in enumerate(dest_infos):
            if info.name not in input_path_map:
                # Missing here, so everything below is missing as well
                break
            current_parent = input_path_map[info.name]
            remainder_start_index = i + 1

        # Create whatever part of the chain is missing. This treats
        # dest_infos as a linear parent -> child chain, which holds when the
        # config snippet has no sibling elements.
        for info in dest_infos[remainder_start_index:]:
            # NOTE(review): only the local name is used, so a namespace on
            # the config element is dropped - confirm intended.
            new_elem = etree.Element(etree.QName(info.node).localname)
            new_elem.attrib.update(info.node.attrib)
            current_parent.append(new_elem)
            current_parent = new_elem

        # Either the pre-existing leaf or the newly created one
        current_parent.text = content

        # Clear text from source
        target_node.text = ""

        log_data["moves"].append({
            "source": source_innermost.name,
            "dest": dest_innermost_name,
            "label": dest_innermost_name.split(" | ")[-1],
            "value": content,
            "summary": f"Moved {source_innermost.name.split(' | ')[-1]} content to {dest_innermost_name.split(' | ')[-1]}"
        })

    def _remove_empty_parents(self, element):
        """Remove ``element`` if empty, then repeat for each parent in turn.

        An element counts as empty when it has no non-blank text and no
        children. An element without a parent (the root, or an already
        detached node) is never removed. ``None`` is accepted and ignored.
        """
        while element is not None:
            has_text = element.text and element.text.strip()
            if has_text or len(element) != 0:
                return
            parent = element.getparent()
            if parent is None:
                return
            parent.remove(element)
            element = parent
| import os | |