Spaces:

andrehoffmann80
/

dora_pubtype

Sleeping

App Files Files Community

andrehoffmann80 committed on Feb 14

Commit

9336543

verified ·

1 Parent(s): 345393f

Upload 4 files

Browse files

Files changed (4) hide show

app.py +346 -0
converter.py +484 -0
requirements.txt +3 -3
utils.py +161 -0

app.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import streamlit as st
+import os
+import io
+import requests
+from lxml import etree
+from converter import ModsConverter
+from urllib.parse import quote, unquote
+# Page-level settings: browser-tab title and icon, wide layout, sidebar collapsed on load.
+st.set_page_config(
+    page_title="PubTypeConverter | DORA Tools",
+    page_icon="🔄",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+# Custom CSS for a modern, premium look, followed by the title banner.
+# The markup is a static literal (nothing is interpolated into it); it is passed with
+# unsafe_allow_html=True so the <style> and <div> tags are emitted as HTML.
+# The .result-card, .step-header and .step-number classes are used by the UI code further down.
+st.markdown("""
+<style>
+    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
+    html, body, [class*="css"] {
+        font-family: 'Inter', sans-serif;
+    }
+    .stApp {
+        background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
+    }
+    /* Premium Header */
+    .title-container {
+        padding: 2rem 0;
+        text-align: center;
+        background: rgba(255, 255, 255, 0.4);
+        backdrop-filter: blur(10px);
+        border-radius: 20px;
+        margin-bottom: 2rem;
+        border: 1px solid rgba(255, 255, 255, 0.5);
+        box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.07);
+    }
+    .main-title {
+        font-size: 3rem;
+        font-weight: 700;
+        background: linear-gradient(90deg, #1e3a8a, #3b82f6);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        margin-bottom: 0.5rem;
+    }
+    .sub-title {
+        color: #64748b;
+        font-size: 1.1rem;
+    }
+    /* Section Styling */
+    .stSelectbox, .stTextInput, .stButton {
+        margin-bottom: 1rem;
+    }
+    /* Card-like containers for results */
+    .result-card {
+        background: white;
+        padding: 1.5rem;
+        border-radius: 15px;
+        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+        margin-bottom: 1.5rem;
+        border-left: 5px solid #3b82f6;
+    }
+    /* Step indicators */
+    .step-header {
+        font-weight: 600;
+        color: #1e293b;
+        margin-bottom: 1rem;
+        display: flex;
+        align-items: center;
+        gap: 0.5rem;
+    }
+    .step-number {
+        background: #3b82f6;
+        color: white;
+        width: 24px;
+        height: 24px;
+        border-radius: 50%;
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        font-size: 0.8rem;
+    }
+</style>
+<div class="title-container">
+    <div class="main-title">DORA PubTypeConverter</div>
+    <div class="sub-title">DORA Publication Type Transformation Helper</div>
+</div>
+""", unsafe_allow_html=True)
+# Setup paths
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+RESOURCE_DIR = os.path.join(BASE_DIR, "PubTypeConverter_resources")
+TEMPLATE_DIR = os.path.join(RESOURCE_DIR, "PubTypeConverter_templates")
+CONFIG_FILE = os.path.join(RESOURCE_DIR, "PubTypeConverterConfig.xml")
+# Initialize Session State
+if 'converter' not in st.session_state:
+    converter = ModsConverter()
+    if os.path.exists(CONFIG_FILE):
+        converter.load_config(CONFIG_FILE)
+    else:
+        st.warning(f"Configuration file not found at {CONFIG_FILE}. Content moving rules will not be applied.")
+    st.session_state.converter = converter
+if 'loaded_files' not in st.session_state:
+    st.session_state.loaded_files = [] # List of dicts: {'name': str, 'content': bytes}
+# Load templates
+templates = []
+if os.path.exists(TEMPLATE_DIR):
+    for f in os.listdir(TEMPLATE_DIR):
+        if f.lower().endswith(".xml"):
+            templates.append(f)
+else:
+    st.error(f"Template directory not found: {TEMPLATE_DIR}")
+templates.sort()
+# Helper function to fetch from DORA
+def fetch_from_dora(pid_or_url, repo):
+    url = pid_or_url.strip()
+    # Handle double-encoded PIDs (e.g. psi%253A84411 -> psi%3A84411 -> psi:84411)
+    # We unquote until the string stabilizes
+    prev = None
+    while url != prev:
+        prev = url
+        url = unquote(url)
+    fallback_www = None
+    fallback_oai = None
+    if not url.startswith("http"):
+        # Construct URL from PID
+        # Assume PID format like "psi:12345" or just "12345"
+        if ":" in url:
+            parts = url.split(":")
+            repo_prefix = parts[0].lower()
+            # Force prefix to lowercase for repo mapping
+            pid_val = f"{repo_prefix}:{parts[1]}"
+            repo = repo_prefix
+        else:
+             # Use selected repo
+             pid_val = f"{repo}:{url}"
+        # URL Encode the PID part - essential for Islandora
+        quoted_pid = quote(pid_val)
+        # Primary: Admin (Intranet, preferred)
+        url = f"https://admin.dora.lib4ri.ch/{repo}/islandora/object/{quoted_pid}/datastream/MODS/view"
+        # Fallback 1: WWW (Public mirror)
+        fallback_www = f"https://www.dora.lib4ri.ch/{repo}/islandora/object/{quoted_pid}/datastream/MODS/view"
+        # Fallback 2: OAI-PMH (Robust Public Access)
+        # Use quoted PID for identifier to be safe (suggested by user example)
+        fallback_oai = f"https://www.dora.lib4ri.ch/{repo}/oai2/request?verb=GetRecord&metadataPrefix=mods&identifier={quoted_pid}"
+    headers = {'User-Agent': 'curl/7.68.0'}
+    # helper to check OAI response for valid content
+    def check_oai_response(content):
+        try:
+             root = etree.fromstring(content)
+             # Check for OAI error code
+             if root.xpath(".//*[local-name()='error']"):
+                 return False
+             # Ultimate robust search: find first 'mods' element regardless of namespace/prefix
+             mods_nodes = root.xpath(".//*[local-name()='mods']")
+             if mods_nodes:
+                 return etree.tostring(mods_nodes[0], encoding='utf-8')
+        except:
+             pass
+        return None
+    try:
+        # Try primary URL
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.content, url
+    except Exception as e:
+        status_errors = [f"Admin: {e}"]
+        # Try Fallback 1: WWW
+        if fallback_www:
+            try:
+                response = requests.get(fallback_www, headers=headers, timeout=10)
+                response.raise_for_status()
+                return response.content, fallback_www
+            except Exception as e2:
+                status_errors.append(f"WWW: {e2}")
+        # Try Fallback 2: OAI-PMH
+        if fallback_oai:
+             try:
+                response = requests.get(fallback_oai, headers=headers, timeout=10)
+                if response.status_code == 200:
+                    mods_content = check_oai_response(response.content)
+                    if mods_content:
+                        return mods_content, fallback_oai
+                    else:
+                        status_errors.append("OAI: Valid HTTP but no MODS found in response")
+                else:
+                    status_errors.append(f"OAI: HTTP {response.status_code}")
+             except Exception as e3:
+                status_errors.append(f"OAI: {e3}")
+        return None, f"Failed to fetch. Details: {'; '.join(status_errors)}"
+# UI Layout
+main_col1, main_col2 = st.columns([0.6, 0.4], gap="large")
+with main_col1:
+    st.markdown('<div class="step-header"><div class="step-number">1</div><span>Select Source Data</span></div>', unsafe_allow_html=True)
+    input_tab1, input_tab2 = st.tabs(["🌐 Pull from DORA", "📁 Upload Local XML"])
+    with input_tab1:
+        dora_col1, dora_col2 = st.columns([0.7, 0.3])
+        dora_input = dora_col1.text_input("PID or URL", placeholder="e.g. psi:84411", label_visibility="collapsed")
+        repo_select = dora_col2.selectbox("Repo", ["psi", "eawag", "empa", "wsl"], label_visibility="collapsed")
+        if st.button("Fetch and Load Record", use_container_width=True):
+            if dora_input:
+                with st.spinner("Retrieving from DORA..."):
+                    content, error_or_url = fetch_from_dora(dora_input, repo_select)
+                    if content:
+                        filename = dora_input.replace(":", "_").replace("/", "_") + ".xml"
+                        if filename.startswith("http"): filename = "dora_record.xml"
+                        st.session_state.loaded_files.append({"name": filename, "content": content, "source": error_or_url})
+                        st.toast(f"Loaded {filename}", icon="✅")
+                    else:
+                        st.error(f"Fetch failed: {error_or_url}")
+            else:
+                st.warning("Please provide a identifier first.")
+    with input_tab2:
+        uploaded_files = st.file_uploader("Upload MODS XML files", type=['xml'], accept_multiple_files=True, label_visibility="collapsed")
+    # Display loaded files in a modern list
+    if st.session_state.loaded_files:
+        st.markdown("### Loaded Documents")
+        for i, file_data in enumerate(st.session_state.loaded_files):
+            with st.container():
+                f_col1, f_col2 = st.columns([0.85, 0.15])
+                f_col1.markdown(f"📄 **{file_data['name']}**")
+                if f_col2.button("🗑️", key=f"remove_{i}", help="Remove this file"):
+                    st.session_state.loaded_files.pop(i)
+                    st.rerun()
+with main_col2:
+    st.markdown('<div class="step-header"><div class="step-number">2</div><span>Target Format</span></div>', unsafe_allow_html=True)
+    selected_template = st.selectbox("Choose the destination publication type", templates if templates else ["No templates found"], label_visibility="collapsed")
+    st.markdown("---")
+    if st.button("🚀 Start Conversion", disabled=not (uploaded_files or st.session_state.loaded_files) or not templates, use_container_width=True, type="primary"):
+        st.session_state.start_convert = True
+    else:
+        st.session_state.start_convert = False
+# Combine sources
+all_files = []
+if uploaded_files:
+    for f in uploaded_files:
+        all_files.append({"name": f.name, "content": f.getvalue()})
+if st.session_state.loaded_files:
+    all_files.extend(st.session_state.loaded_files)
+if st.session_state.get("start_convert"):
+    st.markdown('<div class="step-header"><div class="step-number">3</div><span>Conversion Reports</span></div>', unsafe_allow_html=True)
+    for file_data in all_files:
+        content = file_data['content']
+        filename = file_data['name']
+        # Try to decode if bytes, though lxml can parse bytes directly
+        # But we pass string/bytes/path to converter
+        template_path = os.path.join(TEMPLATE_DIR, selected_template)
+        try:
+            # result_xml is the XML string, log is the structured data dict
+            result_xml, log_data = st.session_state.converter.convert(content, template_path)
+            with st.container():
+                st.markdown(f"""
+                <div class="result-card">
+                    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
+                        <span style="font-weight: 700; font-size: 1.2rem; color: #1e3a8a;">{filename}</span>
+                        <span style="background: #dbeafe; color: #1e40af; padding: 0.2rem 0.8rem; border-radius: 999px; font-size: 0.8rem; font-weight: 600;">TRANSFORMED</span>
+                    </div>
+                    <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 1.5rem;">
+                         <span style="background: #f1f5f9; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #e2e8f0;">{log_data['old_genre']}</span>
+                         <span style="color: #94a3b8;">➡️</span>
+                         <span style="background: #ecfdf5; color: #065f46; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #d1fae5; font-weight: 600;">{log_data['new_genre']}</span>
+                    </div>
+                """, unsafe_allow_html=True)
+                s_col1, s_col2, s_col3 = st.columns(3)
+                s_col1.metric("Transfers", len(log_data["moves"]))
+                s_col2.metric("Additions", len(log_data["additions"]))
+                s_col3.metric("Removals", len(log_data["deletions"]))
+                with st.expander("Audit Transformation Details", expanded=False):
+                    if log_data["moves"]:
+                        st.write("**🔄 Content Transfers**")
+                        for m in log_data["moves"]:
+                            st.caption(f"• {m['summary']}")
+                    if log_data["additions"]:
+                        st.write("**✨ Smart Additions**")
+                        for a in log_data["additions"]:
+                            st.caption(f"• {a['summary']}")
+                    if log_data["deletions"]:
+                        st.write("**🗑️ Legacy Cleanup**")
+                        del_labels = [d['label'] for d in log_data["deletions"]]
+                        st.caption(f"Removed {len(del_labels)} unused fields: " + ", ".join(del_labels[:8]) + ("..." if len(del_labels) > 8 else ""))
+                if log_data["warnings"]:
+                    for w in log_data["warnings"]:
+                        st.warning(w)
+                if result_xml:
+                    st.download_button(
+                        label=f"⬇️ Download {filename}",
+                        data=result_xml,
+                        file_name=f"{os.path.splitext(filename)[0]}_converted.xml",
+                        mime="application/xml",
+                        key=f"dl_{filename}",
+                        use_container_width=True
+                    )
+                st.markdown('</div>', unsafe_allow_html=True)
+        except Exception as e:
+            st.error(f"Error converting {filename}: {e}")
+            import traceback
+            st.code(traceback.format_exc())

converter.py ADDED Viewed

	@@ -0,0 +1,484 @@

+import os
+import copy
+from lxml import etree
+from typing import Dict, List, Tuple, Set
+from utils import XmlHelper, DateInfo, NodeInfo, TAG_GENRE, TAG_ORIGIN_INFO, TAG_DATE_ISSUED, TAG_DATE_OTHER, ATTR_REPORTING_YEAR
+class ModsConverter:
+    def __init__(self) -> None:
+        """Create a converter with no content-move rules loaded."""
+        # Filled by load_config(): maps the key built by _get_key() to a list of
+        # (source elements, destination elements) pairs.
+        self.move_config: Dict[str, list] = {}
+    def load_config(self, config_path: str):
+        """Loads configuration for moving content."""
+        if not os.path.exists(config_path):
+            return
+        parser = etree.XMLParser(remove_blank_text=True)
+        try:
+            tree = etree.parse(config_path, parser)
+            root = tree.getroot()
+            for conversion in root.findall("pubTypeConversion"):
+                p1 = conversion.find("pubType1")
+                p2 = conversion.find("pubType2")
+                if p1 is None or p2 is None: continue
+                pt1 = p1.text
+                pt2 = p2.text
+                moves = []
+                for mc in conversion.findall("moveContent"):
+                    e1 = mc.find("element1")
+                    e2 = mc.find("element2")
+                    if e1 is not None and e2 is not None:
+                         # Store as list of elements to traverse matching path
+                         moves.append((list(e1), list(e2)))
+                if moves:
+                    self.move_config[self._get_key(pt1, pt2)] = moves
+                    # Reverse
+                    self.move_config[self._get_key(pt2, pt1)] = [(m[1], m[0]) for m in moves]
+        except Exception as e:
+            print(f"Error loading config: {e}")
+    def _get_key(self, p1, p2):
+        if not p1 or not p2: return ""
+        return f"{p1.lower()}{p2.lower()}"
+    def convert(self, input_xml_str: str, template_xml_path: str) -> Tuple[str, dict]:
+        """
+        Converts the input XML based on the template.
+        Returns the result XML string and a structured log dictionary.
+        """
+        log_data = {
+            "old_genre": "",
+            "new_genre": "",
+            "moves": [],
+            "additions": [],
+            "deletions": [],
+            "warnings": []
+        }
+        # Parse inputs
+        try:
+            input_tree = XmlHelper.parse_xml(input_xml_str)
+            input_root = input_tree.getroot()
+            template_tree = XmlHelper.parse_xml(template_xml_path)
+            template_root = template_tree.getroot()
+        except ValueError as e:
+            log_data["warnings"].append(f"Error parsing XML: {e}")
+            return "", log_data
+        # 1. Exchange Genre
+        input_genre = XmlHelper.get_genre_node(input_root)
+        template_genre = XmlHelper.get_genre_node(template_root)
+        if input_genre is None or template_genre is None:
+            log_data["warnings"].append("Missing genre element in input or template.")
+            return "", log_data
+        old_genre = input_genre.text
+        new_genre = template_genre.text
+        log_data["old_genre"] = old_genre
+        log_data["new_genre"] = new_genre
+        # Update genre text and attributes
+        input_genre.text = new_genre
+        input_genre.attrib.clear()
+        input_genre.attrib.update(template_genre.attrib)
+        # 2. Move Content
+        key = self._get_key(old_genre, new_genre)
+        if key in self.move_config:
+            moves = self.move_config[key]
+            for source_def, dest_def in moves:
+                self._apply_move(input_root, source_def, dest_def, log_data)
+        # 3. Delete Extra Content
+        # We need a set of all valid paths from template
+        template_nodes_info = XmlHelper.get_all_nodes_info(template_root)
+        template_paths = {n.name for n in template_nodes_info}
+        # Re-scan input nodes after move
+        input_nodes_info = XmlHelper.get_all_nodes_info(input_root)
+        # We iterate and remove.
+        # Logic: If node path not in template, delete it.
+        # "check which nodes of input file are not contained in template - and delete them"
+        # "if parent node is empty now, delete it too"
+        # We should iterate such that we don't try to access removed nodes.
+        # But `input_nodes_info` creates a snapshot.
+        # Checking `node.getparent()` will return None if already removed?
+        # Actually lxml keeps parent ref even if removed from tree? No, `getparent()` returns None if removed.
+        # We need to process this carefully. Java iterates the *snapshot* list.
+        # "check if parent node still exists - because it could have been deleted in a step before"
+        for node_info in input_nodes_info:
+             # Skip empty names (like root if it resolved to empty)
+             if not node_info.name: continue
+             # Case insensitive check for paths
+             template_paths_lower = [p.lower() for p in template_paths]
+             is_in_template = node_info.name.lower() in template_paths_lower
+             # Specific loose matching rules
+             if not is_in_template:
+                 try:
+                     tag = etree.QName(node_info.node).localname.lower()
+                     # Rule 1: Allow 'affiliation' with any attributes if a bare 'affiliation' exists in template
+                     # AND parent path matches.
+                     if tag == 'affiliation':
+                         # Construct relaxed path: remove attributes from the last segment
+                         # Format: "parent | affiliation [type=group]" -> "parent | affiliation"
+                         last_sep = node_info.name.rfind(" | ")
+                         if last_sep != -1:
+                             parent_part = node_info.name[:last_sep]
+                             # We assume the parent path part is correct (since parent wasn't deleted if we are here...
+                             # well, actually we are iterating a snapshot, so parent MIGHT be deleted,
+                             # but we check parent is not none later on deletion)
+                             # Construct potential template path: parent + strict tag name
+                             # We use the tag name from the node, but stripped of attributes
+                             relaxed_candidate = f"{parent_part} | {etree.QName(node_info.node).localname}"
+                             if relaxed_candidate.lower() in template_paths_lower:
+                                 is_in_template = True
+                     # Rule 2: Always preserve 'alternativeName' and its children if parent 'name' is preserved
+                     # (implied by parent path match, but we need to check if we are Inside an alternativeName tree)
+                     # Or just 'alternativeName' tag itself.
+                     # The path for children would be "name | alternativeName | namePart"
+                     # Check if current tag is alternativeName OR if any parent in path is alternativeName
+                     # node_info.name contains full path.
+                     if 'alternativeName' in node_info.name:
+                         # We need to be careful not to preserve it if the parent NAME itself was deleted?
+                         # But we are iterating inputs. Parents are processed?
+                         # Actually we iterate flat list. If parent was deleted, we might validly delete child.
+                         # But here we are deciding if we SHOULD delete.
+                         # If the path contains alternativeName, we check if the base path (up to name) is valid?
+                         # Simpler: If it's alternativeName or child of it, Assume preserved IF parent exists.
+                         # The loop logic "input_nodes_info" contains all nodes.
+                         # If we say `is_in_template = True`, we keep it.
+                         # If parent `name` was removed, then `alternativeName` would be removed automatically?
+                         # No, `parent.remove(node)` removes it from tree.
+                         # But we are iterating a snapshot.
+                         # `if parent is not None:` check handles if parent was already removed/detached?
+                         # Yes, if `name` was removed, `alternativeName.getparent()` (which is that name node)
+                         # is still that node object (it's consistent in lxml), BUT that name node is no longer in tree.
+                         # Wait, if `name` is removed from `mods`, `name.getparent()` might be None?
+                         # lxml: "When an element is removed from its parent, it is not destroyed... getparent() returns None"
+                         # So if parent `name` was removed in previous iteration, `parent` here will be None (or the name node, but name node's parent is None).
+                         # Actually `node.getparent()` returns the parent element.
+                         # If parent element was removed from ITS parent, `node.getparent()` still returns the parent element.
+                         # It's only if `node` was removed from `parent` that `getparent()` is None.
+                         # So we need to ensure we don't keep it if parent is "gone" effectively?
+                         # But the standard logic deletes children if parent is deleted?
+                         # "if parent node is empty now, delete it too" - that's post-deletion cleanup.
+                         # If we mark `alternativeName` as "in template" (preserved), we just DON'T delete it explicitly here.
+                         # If its parent `name` was deleted, then `alternativeName` effectively goes with it.
+                         # So we just need to say: "Don't delete alternativeName just because it's missing from template".
+                         is_in_template = True
+                 except:
+                     pass
+             if not is_in_template:
+                 # Node isn't in template.
+                 node = node_info.node
+                 parent = node.getparent()
+                 if parent is not None:
+                     # Log if it has content
+                     text = node.text
+                     if text and text.strip() and not node_info.has_child_elements:
+                          label = node_info.name.split(" | ")[-1]
+                          log_data["deletions"].append({
+                              "path": node_info.name,
+                              "label": label,
+                              "value": text.strip()
+                          })
+                     # Remove
+                     parent.remove(node)
+                     # Remove empty parents
+                     self._remove_empty_parents(parent)
+        # 4. Sync Template Defaults (Additions)
+        # Anything in template that has text but is missing in input should be added
+        input_nodes_info_final = XmlHelper.get_all_nodes_info(input_root)
+        input_paths_final = {n.name.lower() for n in input_nodes_info_final}
+        for t_info in template_nodes_info:
+            if t_info.name.lower() not in input_paths_final:
+                t_node = t_info.node
+                # Only sync if it has actual text (default value)
+                if t_node.text and t_node.text.strip():
+                    # Construct the path chain from t_node to root
+                    path_elements = []
+                    curr = t_node
+                    while curr is not None and curr != template_root:
+                        path_elements.insert(0, (curr.tag, curr.attrib))
+                        curr = curr.getparent()
+                    if path_elements:
+                        # Find insertion point
+                        current_parent = input_root
+                        for tag, attrib in path_elements:
+                            match = None
+                            for child in current_parent:
+                                # Loose match for sync purposes
+                                if child.tag == tag:
+                                    match = child
+                                    break
+                            if match is not None:
+                                current_parent = match
+                            else:
+                                # Create new
+                                new_elem = etree.Element(tag)
+                                new_elem.attrib.update(attrib)
+                                current_parent.append(new_elem)
+                                current_parent = new_elem
+                        # Set text
+                        current_parent.text = t_node.text
+                        # Better label for addition
+                        label = t_info.name.split(" | ")[-1]
+                        log_data["additions"].append({
+                            "path": t_info.name,
+                            "label": label,
+                            "value": t_node.text,
+                            "summary": f"Set default {label} to '{t_node.text}'"
+                        })
+                        # Add to final paths to avoid duplicates if siblings match
+                        input_paths_final.add(t_info.name.lower())
+        # 5. Handle Dates
+        try:
+            date_info_input = XmlHelper.find_date_nodes(input_root)
+            date_info_template = XmlHelper.find_date_nodes(template_root)
+            if date_info_input.both_dates_in_same_block != date_info_template.both_dates_in_same_block:
+                # We need nodes to manipulate.
+                d_issued = date_info_input.date_issued_node
+                d_reporting = date_info_input.reporting_year_node
+                if d_issued is None or d_reporting is None:
+                    # Can't manipulate if missing
+                     pass
+                elif date_info_input.both_dates_in_same_block:
+                    # Case 1: Currently same block -> Separate them
+                    # "create new origin info element and add as child the reporting year element"
+                    # "remove reporting year element from old origin info element"
+                    # Original origin info
+                    old_origin_info = d_reporting.getparent()
+                    # Create new originInfo
+                    # Where to add? Java: `document.getDocumentElement().appendChild(newOriginInfoNode)` -> To root (mods)
+                    new_origin_info = etree.Element(TAG_ORIGIN_INFO)
+                    input_root.append(new_origin_info)
+                    # Move reporting year
+                    # lxml move is just append to new parent (removes from old automatically)
+                    new_origin_info.append(d_reporting)
+                else: # currently separate -> unite them
+                    # "add reporting year element to the origin info element containing the issue date"
+                    # "remove now empty origin info element which contained the reporting year element"
+                    target_origin_info = d_issued.getparent()
+                    old_host_origin_info = d_reporting.getparent()
+                    target_origin_info.append(d_reporting)
+                    # Remove old host if empty
+                    self._remove_empty_parents(old_host_origin_info)
+        except ValueError as e:
+            log_data["warnings"].append(f"Date processing warning: {e}")
+        # Serialize
+        return etree.tostring(input_root, encoding='unicode', pretty_print=True), log_data
+    def _apply_move(self, root, source_def_list, dest_def_list, log_data):
+        # source_def_list is list of Elements defining the structure to find content
+        # We need to find the innermost element in source path in 'root'
+        # 1. Construct path string match logic is hard with just Elements.
+        # But we can find the node in 'root' that matches the path described by 'source_def_list'
+        # Java `createNodeInfo` uses path names.
+        # Effectively: find a node in root that has same path structure as source_def_list.
+        # The 'source_def_list' comes from config xml <element1><child>...</child></element1>
+        # Helper to get path name for the def list
+        # It seems def list is just a chain of elements?
+        # <element1><relatedItem type="host"><titleInfo><title/></titleInfo></relatedItem></element1>
+        # The list from findall("moveContent") -> element1 children.
+        # If element1 has one child `relatedItem`, and that has child `titleInfo`...
+        # We need to reconstruct the "NodeInfo.name" style string for this chain.
+        # source_def_list is list of children of <element1>. Usually just 1 top child.
+        if not source_def_list: return
+        # Helper to simulate NodeInfo generation for the config snippet
+        def get_snippet_path_name(elements):
+             # Deep traverse the first element until leaf
+             # Java logic: `nodeInfosSource = ModsXmlHelper.createNodeInfo(null, moveContent.sourceNodeList);`
+             # `innermostNode = nodeInfosSource.get(nodeInfosSource.size() - 1);`
+             pass
+        # Let's trust Java's logic: it matches based on `NodeInfo.name`.
+        # So we generate NodeInfo for config snippet.
+        # But config snippet is "detached" elements.
+        # We need a root for the snippet to pass to XmlHelper?
+        # We can wrap source_def_list in a dummy root?
+        dummy = etree.Element("dummy")
+        for e in source_def_list:
+             # We need to deep copy because append moves it
+             dummy.append(copy.deepcopy(e))
+        # But wait, `get_node_path_name` relies on parents.
+        # If we dump it in dummy, parent is dummy.
+        # We need path starting from valid MODS path?
+        # The config usually contains FULL path inside <mods> (implicit?).
+        # Java: `moveContent` uses `modsXmlHelper` which excludes `mods` tag from path.
+        # Example config: `<relatedItem type="host"><titleInfo><title>`
+        # This matches `mods/relatedItem/titleInfo/title`.
+        # So passing children of <element1> to dummy, calling get_all_nodes_info
+        # will give us paths like "relatedItem ... | titleInfo ... | title".
+        # We need the leaf one.
+        source_infos = XmlHelper.get_all_nodes_info(dummy)
+        if not source_infos: return
+        source_innermost = source_infos[-1]
+        # Now find this path in `root`
+        input_nodes_info = XmlHelper.get_all_nodes_info(root)
+        target_node = None
+        for info in input_nodes_info:
+            if info.name == source_innermost.name:
+                target_node = info.node
+                break
+        if target_node is None or not target_node.text:
+             return
+        content = target_node.text
+        # Now find destination
+        dummy_dest = etree.Element("dummy")
+        for e in dest_def_list:
+             dummy_dest.append(copy.deepcopy(e))
+        dest_infos = XmlHelper.get_all_nodes_info(dummy_dest)
+        if not dest_infos: return
+        dest_innermost_name = dest_infos[-1].name
+        # We need to insert this content at dest_innermost_name
+        # Java `insertElement` logic:
+        # Traverse destination path backwards. Find first part that exists in document.
+        # Insert remainder.
+        # We have dest_infos list which represents the FULL path chain.
+        # Check from end: if `info.name` exists in root?
+        # Input nodes map for fast lookup
+        input_path_map = {n.name: n.node for n in input_nodes_info}
+        insertion_point_node = None
+        remainder_start_index = 0
+        # dest_infos is ordered top-down (root to leaf).
+        # We want to find the DEEPEST existing node.
+        for i, info in enumerate(dest_infos):
+             if info.name in input_path_map:
+                  insertion_point_node = input_path_map[info.name]
+                  remainder_start_index = i + 1
+             else:
+                  # This part doesn't exist, and subsequently children won't either
+                  break
+        parent = insertion_point_node
+        if parent is None:
+             parent = root # Start at root if nothing matches (top level element missing)
+        # Construct remainder
+        current_parent = parent
+        # The elements in dest_infos are from dummy tree. We need to create NEW elements in input tree.
+        # We effectively clone the structure from `dest_infos[remainder_start_index:]`.
+        # But wait, `dest_infos` is flat list.
+        # We need hierarchy.
+        # If remainder is empty, it means leaf already exists. We update text.
+        if remainder_start_index >= len(dest_infos):
+             current_parent.text = content
+        else:
+             # We need to build the missing chain.
+             # The `dest_infos` list contains NodeInfos. We can look at `info.node` to get tag/attribs.
+             # The structure of `dest_infos` for `<A><B><C>` is [`A`, `A|B`, `A|B|C`]. (if traversing depth first)
+             # We can't easily jump from `A` to `B` just by list index if there are siblings.
+             # But here config is usually linear path.
+             for i in range(remainder_start_index, len(dest_infos)):
+                  info = dest_infos[i]
+                  # Create element
+                  # info.node is the element in dummy tree
+                  new_elem = etree.Element(etree.QName(info.node).localname)
+                  # Copy attribs
+                  new_elem.attrib.update(info.node.attrib)
+                  current_parent.append(new_elem)
+                  current_parent = new_elem
+             # Set text on the last one
+             current_parent.text = content
+        # Clear text from source
+        target_node.text = ""
+        # Log structured move
+        log_data["moves"].append({
+            "source": source_innermost.name,
+            "dest": dest_innermost_name,
+            "label": dest_innermost_name.split(" | ")[-1],
+            "value": content,
+            "summary": f"Moved {source_innermost.name.split(' | ')[-1]} content to {dest_innermost_name.split(' | ')[-1]}"
+        })
+    def _remove_empty_parents(self, element):
+         if element is None: return
+         # Check if empty: no text (strip), no children
+         has_text = element.text and element.text.strip()
+         if not has_text and len(element) == 0:
+              parent = element.getparent()
+              if parent is not None:
+                   parent.remove(element)
+                   self._remove_empty_parents(parent)
+import os

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-altair
-pandas
-streamlit

+streamlit
+lxml
+requests

utils.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import os
+from lxml import etree
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Tuple
+# Constants for DORA XML elements
+# Tag names are namespace-free local names; XmlHelper compares them against
+# etree.QName(element).localname.
+TAG_GENRE = "genre"
+TAG_MODS = "mods"
+TAG_ORIGIN_INFO = "originInfo"
+TAG_DATE_ISSUED = "dateIssued"
+TAG_DATE_OTHER = "dateOther"
+# Despite the ATTR_ prefix this is matched against attribute *values*:
+# XmlHelper.find_date_nodes treats a dateOther element as the reporting year
+# when any of its attributes has exactly this value.
+ATTR_REPORTING_YEAR = "reporting year"
+@dataclass
+class DateInfo:
+    """Date nodes located under originInfo, as returned by XmlHelper.find_date_nodes."""
+    # True when a single originInfo block contains both a dateIssued child and
+    # a dateOther child marked as reporting year.
+    both_dates_in_same_block: bool
+    # dateIssued element that was found, or None if no scanned originInfo had one.
+    date_issued_node: Optional[etree._Element]
+    # dateOther element with a "reporting year" attribute value, or None if none was found.
+    reporting_year_node: Optional[etree._Element]
+@dataclass
+class NodeInfo:
+    """One entry of the flattened tree produced by XmlHelper.get_all_nodes_info."""
+    # The lxml element this entry describes.
+    node: etree._Element
+    # Parent is implicit in lxml via getparent()
+    # Format (see XmlHelper.get_node_path_name): "tag [attr=value] ..." segments
+    # joined to the ancestors' segments with " | "; a 'mods' element gets "".
+    name: str # The computed "path name" for comparison
+    # True if the node has at least one child element.
+    has_child_elements: bool
+class XmlHelper:
+    @staticmethod
+    def parse_xml(file_path_or_content) -> etree._ElementTree:
+        """Parses an XML file or content."""
+        parser = etree.XMLParser(remove_blank_text=True)
+        try:
+            if isinstance(file_path_or_content, str) and os.path.exists(file_path_or_content):
+                tree = etree.parse(file_path_or_content, parser)
+            else:
+                if isinstance(file_path_or_content, bytes):
+                    tree = etree.fromstring(file_path_or_content, parser).getroottree()
+                else:
+                     tree = etree.fromstring(file_path_or_content.encode('utf-8'), parser).getroottree()
+            return tree
+        except Exception as e:
+            raise ValueError(f"Error parsing XML: {e}")
+    @staticmethod
+    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
+        """Finds the genre element."""
+        # Use simple local-name matching to avoid namespace headaches
+        for elem in root.iter():
+            if etree.QName(elem).localname == TAG_GENRE:
+                return elem
+        return None
+    @staticmethod
+    def find_date_nodes(root: etree._Element) -> DateInfo:
+        """Finds dateIssued and reporting year nodes."""
+        date_issued = None
+        reporting_year = None
+        both_in_same = False
+        origin_infos = []
+        for elem in root.iter():
+            if etree.QName(elem).localname == TAG_ORIGIN_INFO:
+                origin_infos.append(elem)
+        if not origin_infos:
+            raise ValueError("No originInfo elements found in MODS XML")
+        for origin_info in origin_infos:
+            has_issued = False
+            has_reporting = False
+            # Reset for each block to check if THIS block has both
+            current_date_issued = None
+            current_reporting_year = None
+            for child in origin_info:
+                localname = etree.QName(child).localname
+                if localname == TAG_DATE_ISSUED:
+                    has_issued = True
+                    current_date_issued = child
+                elif localname == TAG_DATE_OTHER:
+                    # Check attributes
+                    for attr_name, attr_value in child.attrib.items():
+                        if attr_value == ATTR_REPORTING_YEAR:
+                            has_reporting = True
+                            current_reporting_year = child
+                            break
+            if has_issued:
+                date_issued = current_date_issued
+            if has_reporting:
+                reporting_year = current_reporting_year
+            if has_issued and has_reporting:
+                both_in_same = True
+                # Java code breaks on first occurrence of both in same
+                break
+        return DateInfo(both_in_same, date_issued, reporting_year)
+    @staticmethod
+    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
+        """Generates a unique-ish name for the node based on tag and path."""
+        tag = etree.QName(element).localname
+        if tag == "mods":
+            return ""
+        name = tag
+        if element.attrib:
+            # Sort attribs for consistency
+            for k, v in sorted(element.attrib.items()):
+                name += f" [{k}={v}]"
+        if parent_path:
+            return f"{parent_path} | {name}"
+        return name
+    @staticmethod
+    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
+        """Flattens the XML structure to a list of NodeInfo."""
+        nodes = []
+        def traverse(element, parent_path):
+            # Calculate path for current element
+            # Note: The root 'mods' element usually has empty path name in Java logic
+            current_path = XmlHelper.get_node_path_name(element, parent_path)
+            has_child_elements = False
+            for child in element:
+                if isinstance(child, etree._Element):
+                     has_child_elements = True
+                     # Recurse
+                     traverse(child, current_path)
+            # Creating info for CURRENT node
+            # We skip adding 'mods' root itself to the list if its path is empty?
+            # Java: "if (currentNodeName.equalsIgnoreCase(TAGNAME_MODS)) ... resultList.add(nodeInfo)"
+            # It ADDS it, but name is empty?
+            # Java: "String nodeName = ... ? "" : currentNodeName"
+            # Java: "if (!parentNodeName.isEmpty()) nodeName = parentNodeName + ' | ' + nodeName"
+            # If it's root mods, parent_path is None/Empty. nodeName is "".
+            # If it's child of mods, parent_path is "". nodeName is "genre". Result "genre".
+            # So root mods is added with name "".
+            # Note: The comparison logic later uses these names.
+            # If template has root mods (""), input has root mods (""). They match.
+            # So we should include it.
+            nodes.append(NodeInfo(element, current_path, has_child_elements))
+        # Root typically 'mods'
+        # Pass parent_path="" implies we are at top.
+        # But wait, get_node_path_name for root mods returns "".
+        # For child 'genre', parent_path is "". get_node_path_name returns "genre".
+        # This matches Java logic.
+        traverse(root, "")
+        return nodes