RDF Validation Deployment committed on
Commit
2602b43
·
1 Parent(s): e0e9b68

Skip generating corrected XML when input is already valid; add namespace/wrapper guard for snippets; fix UI wiring; revalidation uses wrapped XML.

Browse files
Files changed (1) hide show
  1. app.py +85 -4
app.py CHANGED
@@ -1022,13 +1022,93 @@ def extract_xml_from_text(text: str) -> str:
1022
  fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
1023
  return fenced if fenced else text
1024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1025
  def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True, include_warnings: bool = False, iterate_until_valid: bool = False, max_attempts: int = 3, show_steps: bool = False):
1026
  """Main validation function for Gradio interface"""
1027
  if not rdf_content.strip():
1028
- return "❌ Error", "No RDF/XML data provided", "", "", "", ""
1029
 
1030
  # Validate RDF
1031
- result = validate_rdf_tool(rdf_content, template)
 
1032
 
1033
  if "error" in result:
1034
  return f"❌ Error: {result['error']}", "", "", "", "", "", ""
@@ -1065,7 +1145,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1065
  steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
1066
  if result["conforms"]:
1067
  suggestions = "βœ… No issues found! Your RDF/XML is valid according to the selected template."
1068
- corrected_rdf = "<!-- Already valid - no corrections needed -->\n" + rdf_content
1069
  corrected_status = "β€”"
1070
  corrected_results = ""
1071
  steps_log.append("No correction needed; record already conforms")
@@ -1086,6 +1166,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1086
  # Attempt re-validation of corrected RDF
1087
  try:
1088
  corrected_xml = extract_xml_from_text(corrected_rdf)
 
1089
  reval = validate_rdf_tool(corrected_xml, template)
1090
  if "error" in reval:
1091
  corrected_status = f"❌ Re-validation Error: {reval['error']}"
@@ -1223,7 +1304,7 @@ def create_interface():
1223
  value="monograph",
1224
  info="Select the SHACL template to validate against"
1225
  )
1226
-
1227
  use_ai_checkbox = gr.Checkbox(
1228
  label="Use AI Features",
1229
  value=True,
 
1022
  fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
1023
  return fenced if fenced else text
1024
 
1025
# --- Namespace and wrapper helpers to avoid XML parser errors ---
# Prefix -> namespace URI for the vocabularies these helpers are willing to
# declare on the caller's behalf when a prefix is used but not bound.
STANDARD_NAMESPACES = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "bf": "http://id.loc.gov/ontologies/bibframe/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}


def _extract_declared_namespaces(xml_text: str) -> dict:
    """Return ``{prefix: uri}`` for ``xmlns:prefix`` declarations in *xml_text*.

    Both double- and single-quoted attribute values are recognised, and
    whitespace around ``=`` is tolerated, since all of these are legal XML.
    Only the first 2000 characters are scanned.
    """
    import re

    # NOTE(review): the 2000-character window is kept from the original
    # implementation - presumably a cheap way to look only at the root
    # element; confirm it is large enough for real records.
    head = xml_text[:2000]
    pattern = r"""xmlns:([A-Za-z0-9_-]+)\s*=\s*(?:"([^"]+)"|'([^']+)')"""
    decls = {}
    for prefix, double_quoted, single_quoted in re.findall(pattern, head):
        decls[prefix] = double_quoted or single_quoted
    return decls


def _detect_used_prefixes(xml_text: str) -> set:
    """Return the set of namespace prefixes used by tags or attributes.

    This is a regex scan, not a parse, so text that merely looks like
    ``prefix:name=`` is also picked up. The reserved prefixes ``xmlns`` and
    ``xml`` are never reported: they are bound by the XML Namespaces
    specification itself and must not be declared by callers.
    """
    import re

    used = set()
    # Tag prefixes like <bf:Work ...>
    for m in re.finditer(r"<\s*([A-Za-z0-9_-]+):[A-Za-z0-9_-]+", xml_text):
        used.add(m.group(1))
    # Attribute prefixes like rdf:about="..."
    for m in re.finditer(r"\s([A-Za-z0-9_-]+):[A-Za-z0-9_-]+\s*=", xml_text):
        used.add(m.group(1))
    used -= {"xmlns", "xml"}
    return used


def ensure_rdf_wrapper_and_namespaces(
    xml_text: str,
    original_text: Optional[str] = None,
    steps_log: Optional[List[str]] = None,
) -> str:
    """Ensure *xml_text* has an ``<rdf:RDF>`` wrapper and xmlns declarations.

    - If a wrapper exists, any prefix that is used but not declared in
      *xml_text* is declared on the first ``<rdf:RDF ...>`` start tag, using the
      URI from *original_text* when it declares that prefix, otherwise the URI
      from ``STANDARD_NAMESPACES``. Prefixes with no known URI are left alone
      rather than guessed.
    - If the wrapper is missing, the content is wrapped in ``<rdf:RDF>`` with
      declarations for every used prefix whose URI is known. A leading
      ``<?xml ...?>`` declaration is kept in front of the wrapper, because an
      XML declaration is only legal at the very start of a document.

    Args:
        xml_text: RDF/XML document or snippet to repair.
        original_text: Optional earlier version of the document whose
            namespace declarations may be reused.
        steps_log: Optional list; a human-readable note is appended when the
            text is modified.

    Returns:
        The repaired XML, or *xml_text* unchanged when it is empty, not a
        string, or already has everything that can be supplied.
    """
    if not xml_text or not isinstance(xml_text, str):
        return xml_text
    import re

    def _attr(prefix: str, uri: str) -> str:
        # Values are always emitted double-quoted, so a literal '"' (possible
        # in a URI copied from a single-quoted declaration) must be escaped.
        return 'xmlns:{}="{}"'.format(prefix, uri.replace('"', "&quot;"))

    declared_here = _extract_declared_namespaces(xml_text)

    # URIs available for injection. Precedence (lowest to highest): standard
    # table, declarations in xml_text, declarations in original_text.
    known = dict(STANDARD_NAMESPACES)
    known.update(declared_here)
    if original_text:
        known.update(_extract_declared_namespaces(original_text))

    used = _detect_used_prefixes(xml_text)
    # Always consider rdf used, since the wrapper element itself needs it.
    used.add("rdf")

    # Match the first <rdf:RDF ...> start tag. The lookahead rejects longer
    # names such as <rdf:RDFa>; the quoted alternatives let attribute values
    # contain '>'. The simpler pattern is a fallback for malformed quoting.
    root = re.search(
        r"""<rdf:RDF(?=[\s/>])(?:[^>"']|"[^"]*"|'[^']*')*>""", xml_text
    ) or re.search(r"<rdf:RDF(?=[\s/>])[^>]*>", xml_text)

    if root is not None:
        start_tag = root.group(0)
        # Decide "missing" from what xml_text itself declares: a prefix that is
        # only declared in original_text is still unbound here. The explicit
        # start-tag check avoids emitting a duplicate attribute.
        missing = sorted(
            p
            for p in used
            if p in known
            and p not in declared_here
            and not re.search(
                r"(?<=\s)xmlns:" + re.escape(p) + r"\s*=", start_tag
            )
        )
        if not missing:
            return xml_text

        # Keep a self-closing root (<rdf:RDF/>) self-closing.
        closer = "/>" if start_tag.endswith("/>") else ">"
        tag_head = start_tag[: -len(closer)].rstrip()
        new_tag = (
            tag_head + " " + " ".join(_attr(p, known[p]) for p in missing) + closer
        )
        if steps_log is not None:
            steps_log.append(
                f"Injected missing namespace declarations: {', '.join(missing)}"
            )
        return xml_text[: root.start()] + new_tag + xml_text[root.end():]

    # No wrapper: hoist a leading XML declaration (and any BOM/whitespace in
    # front of it) so that it stays the first thing in the document.
    prolog = ""
    body = xml_text
    xml_decl = re.match(r"[\s\ufeff]*(<\?xml(?=[\s?])[^>]*\?>)", xml_text)
    if xml_decl is not None:
        prolog = xml_decl.group(1) + "\n"
        body = xml_text[xml_decl.end():].lstrip()

    # Sorted so the generated wrapper is deterministic across runs.
    attrs = [_attr("rdf", STANDARD_NAMESPACES["rdf"])]
    attrs.extend(
        _attr(p, known[p]) for p in sorted(used - {"rdf"}) if known.get(p)
    )
    updated = prolog + "<rdf:RDF " + " ".join(attrs) + ">\n" + body + "\n</rdf:RDF>"
    if steps_log is not None:
        steps_log.append(
            "Wrapped snippet in <rdf:RDF> with standard namespace declarations"
        )
    return updated
1103
+
1104
  def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True, include_warnings: bool = False, iterate_until_valid: bool = False, max_attempts: int = 3, show_steps: bool = False):
1105
  """Main validation function for Gradio interface"""
1106
  if not rdf_content.strip():
1107
+ return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
1108
 
1109
  # Validate RDF
1110
+ prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content)
1111
+ result = validate_rdf_tool(prepped_input, template)
1112
 
1113
  if "error" in result:
1114
  return f"❌ Error: {result['error']}", "", "", "", "", "", ""
 
1145
  steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
1146
  if result["conforms"]:
1147
  suggestions = "βœ… No issues found! Your RDF/XML is valid according to the selected template."
1148
+ corrected_rdf = ""
1149
  corrected_status = "β€”"
1150
  corrected_results = ""
1151
  steps_log.append("No correction needed; record already conforms")
 
1166
  # Attempt re-validation of corrected RDF
1167
  try:
1168
  corrected_xml = extract_xml_from_text(corrected_rdf)
1169
+ corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
1170
  reval = validate_rdf_tool(corrected_xml, template)
1171
  if "error" in reval:
1172
  corrected_status = f"❌ Re-validation Error: {reval['error']}"
 
1304
  value="monograph",
1305
  info="Select the SHACL template to validate against"
1306
  )
1307
+
1308
  use_ai_checkbox = gr.Checkbox(
1309
  label="Use AI Features",
1310
  value=True,