RDF Validation Deployment
committed on
Commit
·
2602b43
1
Parent(s):
e0e9b68
Skip generating corrected XML when input is already valid; add namespace/wrapper guard for snippets; fix UI wiring; revalidation uses wrapped XML.
Browse files
app.py
CHANGED
|
@@ -1022,13 +1022,93 @@ def extract_xml_from_text(text: str) -> str:
|
|
| 1022 |
fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
|
| 1023 |
return fenced if fenced else text
|
| 1024 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1025 |
def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True, include_warnings: bool = False, iterate_until_valid: bool = False, max_attempts: int = 3, show_steps: bool = False):
|
| 1026 |
"""Main validation function for Gradio interface"""
|
| 1027 |
if not rdf_content.strip():
|
| 1028 |
-
return "β Error", "No RDF/XML data provided", "", "", "", ""
|
| 1029 |
|
| 1030 |
# Validate RDF
|
| 1031 |
-
|
|
|
|
| 1032 |
|
| 1033 |
if "error" in result:
|
| 1034 |
return f"β Error: {result['error']}", "", "", "", "", "", ""
|
|
@@ -1065,7 +1145,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1065 |
steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
|
| 1066 |
if result["conforms"]:
|
| 1067 |
suggestions = "β
No issues found! Your RDF/XML is valid according to the selected template."
|
| 1068 |
-
corrected_rdf = "
|
| 1069 |
corrected_status = "β"
|
| 1070 |
corrected_results = ""
|
| 1071 |
steps_log.append("No correction needed; record already conforms")
|
|
@@ -1086,6 +1166,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1086 |
# Attempt re-validation of corrected RDF
|
| 1087 |
try:
|
| 1088 |
corrected_xml = extract_xml_from_text(corrected_rdf)
|
|
|
|
| 1089 |
reval = validate_rdf_tool(corrected_xml, template)
|
| 1090 |
if "error" in reval:
|
| 1091 |
corrected_status = f"β Re-validation Error: {reval['error']}"
|
|
@@ -1223,7 +1304,7 @@ def create_interface():
|
|
| 1223 |
value="monograph",
|
| 1224 |
info="Select the SHACL template to validate against"
|
| 1225 |
)
|
| 1226 |
-
|
| 1227 |
use_ai_checkbox = gr.Checkbox(
|
| 1228 |
label="Use AI Features",
|
| 1229 |
value=True,
|
|
|
|
| 1022 |
fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
|
| 1023 |
return fenced if fenced else text
|
| 1024 |
|
| 1025 |
+
# --- Namespace and wrapper helpers to avoid XML parser errors ---
# Prefix -> namespace URI table for the vocabularies the helpers below are
# willing to declare automatically. Prefixes that are not listed here are
# never assigned a URI from this table (no guessing).
STANDARD_NAMESPACES = {
    # RDF syntax namespace; also the namespace of the <rdf:RDF> wrapper itself.
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    # BIBFRAME ontology (Library of Congress).
    "bf": "http://id.loc.gov/ontologies/bibframe/",
    # RDF Schema.
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    # XML Schema datatypes.
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}
|
| 1032 |
+
|
| 1033 |
+
def _extract_declared_namespaces(xml_text: str) -> dict:
|
| 1034 |
+
import re
|
| 1035 |
+
decls = {}
|
| 1036 |
+
for prefix, uri in re.findall(r"xmlns:([A-Za-z0-9_-]+)=\"([^\"]+)\"", xml_text[:2000]):
|
| 1037 |
+
decls[prefix] = uri
|
| 1038 |
+
return decls
|
| 1039 |
+
|
| 1040 |
+
def _detect_used_prefixes(xml_text: str) -> set:
|
| 1041 |
+
import re
|
| 1042 |
+
used = set()
|
| 1043 |
+
# Tag prefixes like <bf:Work ...> and attribute prefixes like rdf:type="..."
|
| 1044 |
+
for m in re.finditer(r"<\s*([A-Za-z0-9_-]+):[A-Za-z0-9_-]+", xml_text):
|
| 1045 |
+
used.add(m.group(1))
|
| 1046 |
+
for m in re.finditer(r"\s([A-Za-z0-9_-]+):[A-Za-z0-9_-]+=", xml_text):
|
| 1047 |
+
used.add(m.group(1))
|
| 1048 |
+
return used
|
| 1049 |
+
|
| 1050 |
+
def ensure_rdf_wrapper_and_namespaces(
    xml_text: str,
    original_text: Optional[str] = None,
    steps_log: Optional[List[str]] = None,
) -> str:
    """Ensure the XML has an <rdf:RDF> wrapper and required xmlns declarations for used prefixes.

    - If a wrapper exists, any used prefix that is not declared in *xml_text*
      but has a known URI is declared on the first ``<rdf:RDF ...>`` start tag.
    - If the wrapper is missing, the content is wrapped in ``<rdf:RDF>`` with
      declarations for every used prefix that has a known URI. A leading
      ``<?xml ...?>`` declaration is kept in front of the wrapper, because an
      XML declaration is only legal at the very start of a document.

    Known URIs come from, in increasing priority: ``STANDARD_NAMESPACES``,
    declarations found in *xml_text* itself, and declarations found in
    *original_text*. Prefixes without a known URI are left untouched rather
    than guessed.

    Args:
        xml_text: XML document or snippet to prepare. Falsy or non-string
            values are returned unchanged.
        original_text: Optional earlier version of the document whose
            ``xmlns:`` declarations may be reused for prefixes that
            *xml_text* uses but does not declare.
        steps_log: Optional list that receives a human-readable note when the
            text is modified.

    Returns:
        The (possibly updated) XML text.
    """
    if not xml_text or not isinstance(xml_text, str):
        return xml_text
    import re

    def _xmlns_attr(prefix: str, uri: str) -> str:
        # URIs are emitted double-quoted, so escape any embedded double quote.
        return 'xmlns:{}="{}"'.format(prefix, uri.replace('"', "&quot;"))

    # What xml_text already declares, kept separate from what we merely know
    # a URI for; conflating the two would suppress needed injections.
    present = _extract_declared_namespaces(xml_text)
    known = {**STANDARD_NAMESPACES, **present}
    if original_text:
        # Reuse declarations present in the original input
        known.update(_extract_declared_namespaces(original_text))

    # Reserved prefixes never need (and must not get) an injected declaration.
    used = _detect_used_prefixes(xml_text) - {"xmlns", "xml"}
    # Always consider rdf used for wrapper
    used.add("rdf")

    wrapper_re = r"<rdf:RDF\b[^>]*>"
    if re.search(wrapper_re, xml_text):
        # Sorted so the generated attributes are deterministic across runs.
        missing = sorted(p for p in used if p not in present and p in known)
        if not missing:
            return xml_text
        added_attrs = " ".join(_xmlns_attr(p, known[p]) for p in missing)

        def _inject(match):
            # Insert before the tag terminator, preserving a self-closing "/>".
            start_tag = match.group(0)
            closer = "/>" if start_tag.endswith("/>") else ">"
            return start_tag[: -len(closer)].rstrip() + " " + added_attrs + closer

        updated = re.sub(wrapper_re, _inject, xml_text, count=1)
        if steps_log is not None:
            steps_log.append(f"Injected missing namespace declarations: {', '.join(missing)}")
        return updated

    # No wrapper: build one that declares rdf first, then every other used
    # prefix we know a URI for.
    attrs = [_xmlns_attr("rdf", STANDARD_NAMESPACES["rdf"])]
    attrs.extend(
        _xmlns_attr(p, known[p]) for p in sorted(used) if p != "rdf" and p in known
    )
    wrapper_open = "<rdf:RDF " + " ".join(attrs) + ">\n"
    wrapper_close = "\n</rdf:RDF>"

    # Keep a leading XML declaration outside (before) the new root element.
    prolog, body = "", xml_text
    decl = re.match(r"\s*<\?xml\s[^>]*\?>\s*", xml_text)
    if decl:
        prolog = decl.group(0).strip() + "\n"
        body = xml_text[decl.end():]

    updated = prolog + wrapper_open + body + wrapper_close
    if steps_log is not None:
        steps_log.append("Wrapped snippet in <rdf:RDF> with standard namespace declarations")
    return updated
|
| 1103 |
+
|
| 1104 |
def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True, include_warnings: bool = False, iterate_until_valid: bool = False, max_attempts: int = 3, show_steps: bool = False):
|
| 1105 |
"""Main validation function for Gradio interface"""
|
| 1106 |
if not rdf_content.strip():
|
| 1107 |
+
return "β Error", "No RDF/XML data provided", "", "", "", "", ""
|
| 1108 |
|
| 1109 |
# Validate RDF
|
| 1110 |
+
prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content)
|
| 1111 |
+
result = validate_rdf_tool(prepped_input, template)
|
| 1112 |
|
| 1113 |
if "error" in result:
|
| 1114 |
return f"β Error: {result['error']}", "", "", "", "", "", ""
|
|
|
|
| 1145 |
steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
|
| 1146 |
if result["conforms"]:
|
| 1147 |
suggestions = "β
No issues found! Your RDF/XML is valid according to the selected template."
|
| 1148 |
+
corrected_rdf = ""
|
| 1149 |
corrected_status = "β"
|
| 1150 |
corrected_results = ""
|
| 1151 |
steps_log.append("No correction needed; record already conforms")
|
|
|
|
| 1166 |
# Attempt re-validation of corrected RDF
|
| 1167 |
try:
|
| 1168 |
corrected_xml = extract_xml_from_text(corrected_rdf)
|
| 1169 |
+
corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
|
| 1170 |
reval = validate_rdf_tool(corrected_xml, template)
|
| 1171 |
if "error" in reval:
|
| 1172 |
corrected_status = f"β Re-validation Error: {reval['error']}"
|
|
|
|
| 1304 |
value="monograph",
|
| 1305 |
info="Select the SHACL template to validate against"
|
| 1306 |
)
|
| 1307 |
+
|
| 1308 |
use_ai_checkbox = gr.Checkbox(
|
| 1309 |
label="Use AI Features",
|
| 1310 |
value=True,
|