RDF Validation Deployment
committed on
Commit
·
a6b66ad
1
Parent(s):
66fc039
Fix validator debugging and error handling
Browse files
- Add test_validator_functionality() to test validator on startup
- Enhanced validate_rdf_tool() with detailed logging and error handling
- Updated validate_rdf_interface() with comprehensive step logging
- Added clean_xml_for_validation() to remove HTML comments from AI output
- Better error messages for import/attribute errors in validator
- Debug preview of XML content being validated
- Warning detection for suspicious validation results (conforms=True with no feedback)
- More detailed re-validation logging to track correction success/failure
This should help diagnose why invalid RDF might be showing as valid.
app.py
CHANGED
|
@@ -25,9 +25,19 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
| 25 |
try:
|
| 26 |
from validator import validate_rdf
|
| 27 |
VALIDATOR_AVAILABLE = True
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
VALIDATOR_AVAILABLE = False
|
| 30 |
-
print("⚠️ Warning:
|
| 31 |
|
| 32 |
# Optional: Check if OpenAI and requests are available
|
| 33 |
try:
|
|
@@ -61,6 +71,33 @@ ENABLE_VALIDATION_LOOP = True # Enable validation loop by default
|
|
| 61 |
MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
|
| 62 |
MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
|
| 65 |
"""
|
| 66 |
Query the MCP4BibFrame documentation API using the MCP protocol.
|
|
@@ -332,23 +369,51 @@ def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
|
|
| 332 |
return {"error": "No RDF/XML content provided", "conforms": False}
|
| 333 |
|
| 334 |
if not VALIDATOR_AVAILABLE:
|
|
|
|
| 335 |
return {
|
| 336 |
"error": "Validator not available - ensure validator.py is present",
|
| 337 |
"conforms": False
|
| 338 |
}
|
| 339 |
|
| 340 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
return {
|
| 344 |
"conforms": conforms,
|
| 345 |
-
"results": results_text,
|
| 346 |
"template": template,
|
| 347 |
"status": "✅ Valid RDF" if conforms else "❌ Invalid RDF"
|
| 348 |
}
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
except Exception as e:
|
| 351 |
logger.error(f"Validation error: {str(e)}")
|
|
|
|
|
|
|
| 352 |
return {
|
| 353 |
"error": f"Validation failed: {str(e)}",
|
| 354 |
"conforms": False
|
|
@@ -1131,6 +1196,43 @@ def extract_xml_from_text(text: str) -> str:
|
|
| 1131 |
fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
|
| 1132 |
return fenced if fenced else text
|
| 1133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1134 |
# --- Namespace and wrapper helpers to avoid XML parser errors ---
|
| 1135 |
STANDARD_NAMESPACES = {
|
| 1136 |
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
|
@@ -1215,15 +1317,36 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1215 |
if not rdf_content.strip():
|
| 1216 |
return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
|
| 1217 |
|
| 1218 |
-
|
| 1219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1220 |
result = validate_rdf_tool(prepped_input, template)
|
| 1221 |
|
| 1222 |
if "error" in result:
|
| 1223 |
-
|
|
|
|
| 1224 |
|
| 1225 |
status = result["status"]
|
| 1226 |
results_text = result["results"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
|
| 1228 |
# Filter results if warnings should be excluded
|
| 1229 |
filtered_results = results_text
|
|
@@ -1243,16 +1366,17 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1243 |
filtered_lines.append(line)
|
| 1244 |
|
| 1245 |
filtered_results = '\n'.join(filtered_lines)
|
|
|
|
|
|
|
| 1246 |
|
| 1247 |
corrected_status = ""
|
| 1248 |
corrected_results = ""
|
| 1249 |
-
|
| 1250 |
-
steps_log.append(f"Initial validation: {'PASSED' if result['conforms'] else 'FAILED'} using template '{template}'")
|
| 1251 |
if not include_warnings:
|
| 1252 |
steps_log.append("Configured to ignore warnings in AI processing")
|
| 1253 |
if iterate_until_valid:
|
| 1254 |
steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
|
| 1255 |
-
if
|
| 1256 |
suggestions = "✅ No issues found! Your RDF/XML is valid according to the selected template."
|
| 1257 |
corrected_rdf = ""
|
| 1258 |
corrected_status = "β"
|
|
@@ -1274,8 +1398,18 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1274 |
)
|
| 1275 |
# Attempt re-validation of corrected RDF
|
| 1276 |
try:
|
| 1277 |
-
|
|
|
|
|
|
|
| 1278 |
corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1279 |
reval = validate_rdf_tool(corrected_xml, template)
|
| 1280 |
if "error" in reval:
|
| 1281 |
corrected_status = f"❌ Re-validation Error: {reval['error']}"
|
|
@@ -1284,7 +1418,8 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
|
|
| 1284 |
else:
|
| 1285 |
corrected_status = reval.get("status", "")
|
| 1286 |
corrected_results = reval.get("results", "")
|
| 1287 |
-
|
|
|
|
| 1288 |
except Exception as re_ex:
|
| 1289 |
corrected_status = f"❌ Re-validation Error: {re_ex}"
|
| 1290 |
corrected_results = ""
|
|
|
|
| 25 |
try:
|
| 26 |
from validator import validate_rdf
|
| 27 |
VALIDATOR_AVAILABLE = True
|
| 28 |
+
# Test that the function is callable
|
| 29 |
+
if not callable(validate_rdf):
|
| 30 |
+
print("⚠️ Warning: validate_rdf is not callable")
|
| 31 |
+
VALIDATOR_AVAILABLE = False
|
| 32 |
+
else:
|
| 33 |
+
print("✅ Validator module loaded successfully")
|
| 34 |
+
except ImportError as e:
|
| 35 |
+
VALIDATOR_AVAILABLE = False
|
| 36 |
+
print(f"⚠️ Warning: validator.py not found or has import errors: {e}")
|
| 37 |
+
print("Some features may be limited.")
|
| 38 |
+
except Exception as e:
|
| 39 |
VALIDATOR_AVAILABLE = False
|
| 40 |
+
print(f"⚠️ Warning: Error loading validator: {e}")
|
| 41 |
|
| 42 |
# Optional: Check if OpenAI and requests are available
|
| 43 |
try:
|
|
|
|
| 71 |
MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
|
| 72 |
MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
|
| 73 |
|
| 74 |
+
def test_validator_functionality():
|
| 75 |
+
"""Test if the validator is actually working"""
|
| 76 |
+
if not VALIDATOR_AVAILABLE:
|
| 77 |
+
print("❌ Validator not available for testing")
|
| 78 |
+
return False
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
# Test with a simple invalid RDF
|
| 82 |
+
test_rdf = '<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><bf:Work/></rdf:RDF>'
|
| 83 |
+
conforms, results = validate_rdf(test_rdf.encode('utf-8'), 'monograph')
|
| 84 |
+
|
| 85 |
+
# This should fail validation (missing namespace, missing properties)
|
| 86 |
+
if conforms:
|
| 87 |
+
print("⚠️ WARNING: Validator returned 'conforms=True' for invalid RDF. Validator may not be working correctly!")
|
| 88 |
+
return False
|
| 89 |
+
else:
|
| 90 |
+
print(f"✅ Validator test passed. Got expected failure: {results[:100] if results else 'No results'}")
|
| 91 |
+
return True
|
| 92 |
+
|
| 93 |
+
except Exception as e:
|
| 94 |
+
print(f"❌ Validator test failed with error: {e}")
|
| 95 |
+
return False
|
| 96 |
+
|
| 97 |
+
# Run the test on startup
|
| 98 |
+
if VALIDATOR_AVAILABLE:
|
| 99 |
+
test_validator_functionality()
|
| 100 |
+
|
| 101 |
def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
|
| 102 |
"""
|
| 103 |
Query the MCP4BibFrame documentation API using the MCP protocol.
|
|
|
|
| 369 |
return {"error": "No RDF/XML content provided", "conforms": False}
|
| 370 |
|
| 371 |
if not VALIDATOR_AVAILABLE:
|
| 372 |
+
logger.error("Validator module not available")
|
| 373 |
return {
|
| 374 |
"error": "Validator not available - ensure validator.py is present",
|
| 375 |
"conforms": False
|
| 376 |
}
|
| 377 |
|
| 378 |
try:
|
| 379 |
+
# Log what we're validating
|
| 380 |
+
logger.info(f"Validating RDF with template '{template}', content length: {len(rdf_content)}")
|
| 381 |
+
|
| 382 |
+
# Call the validator
|
| 383 |
conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
|
| 384 |
|
| 385 |
+
# Debug logging
|
| 386 |
+
logger.info(f"Validation result - conforms: {conforms}, results length: {len(results_text) if results_text else 0}")
|
| 387 |
+
|
| 388 |
+
# If no results text but claims to conform, something might be wrong
|
| 389 |
+
if conforms and (not results_text or len(results_text.strip()) == 0):
|
| 390 |
+
results_text = "Validation passed with no specific feedback."
|
| 391 |
+
elif not conforms and (not results_text or len(results_text.strip()) == 0):
|
| 392 |
+
results_text = "Validation failed but no specific errors were returned. Check the RDF syntax and structure."
|
| 393 |
+
|
| 394 |
return {
|
| 395 |
"conforms": conforms,
|
| 396 |
+
"results": results_text if results_text else "",
|
| 397 |
"template": template,
|
| 398 |
"status": "✅ Valid RDF" if conforms else "❌ Invalid RDF"
|
| 399 |
}
|
| 400 |
|
| 401 |
+
except ImportError as e:
|
| 402 |
+
logger.error(f"Import error in validator: {str(e)}")
|
| 403 |
+
return {
|
| 404 |
+
"error": f"Validator import error: {str(e)}. Check that all dependencies are installed.",
|
| 405 |
+
"conforms": False
|
| 406 |
+
}
|
| 407 |
+
except AttributeError as e:
|
| 408 |
+
logger.error(f"Validator function not found: {str(e)}")
|
| 409 |
+
return {
|
| 410 |
+
"error": f"Validator function error: {str(e)}. Check validator.py implementation.",
|
| 411 |
+
"conforms": False
|
| 412 |
+
}
|
| 413 |
except Exception as e:
|
| 414 |
logger.error(f"Validation error: {str(e)}")
|
| 415 |
+
import traceback
|
| 416 |
+
logger.error(f"Full traceback: {traceback.format_exc()}")
|
| 417 |
return {
|
| 418 |
"error": f"Validation failed: {str(e)}",
|
| 419 |
"conforms": False
|
|
|
|
| 1196 |
fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
|
| 1197 |
return fenced if fenced else text
|
| 1198 |
|
| 1199 |
+
def clean_xml_for_validation(xml_text: str) -> str:
|
| 1200 |
+
"""
|
| 1201 |
+
Clean XML text for validation by removing comments and extra formatting.
|
| 1202 |
+
|
| 1203 |
+
Args:
|
| 1204 |
+
xml_text (str): XML text that may contain comments or formatting
|
| 1205 |
+
|
| 1206 |
+
Returns:
|
| 1207 |
+
str: Clean XML ready for validation
|
| 1208 |
+
"""
|
| 1209 |
+
import re
|
| 1210 |
+
|
| 1211 |
+
if not xml_text:
|
| 1212 |
+
return xml_text
|
| 1213 |
+
|
| 1214 |
+
# Remove all HTML comments
|
| 1215 |
+
cleaned = re.sub(r'<!--.*?-->', '', xml_text, flags=re.DOTALL)
|
| 1216 |
+
|
| 1217 |
+
# Remove any leading/trailing whitespace
|
| 1218 |
+
cleaned = cleaned.strip()
|
| 1219 |
+
|
| 1220 |
+
# If the text starts with "```" code fence, extract content
|
| 1221 |
+
if cleaned.startswith("```"):
|
| 1222 |
+
try:
|
| 1223 |
+
# Extract content between code fences
|
| 1224 |
+
parts = cleaned.split("```")
|
| 1225 |
+
if len(parts) >= 3:
|
| 1226 |
+
# Second part should be the XML content
|
| 1227 |
+
cleaned = parts[1]
|
| 1228 |
+
# Remove language identifier if present (e.g., "xml")
|
| 1229 |
+
if cleaned.startswith("xml"):
|
| 1230 |
+
cleaned = cleaned[3:]
|
| 1231 |
+
except:
|
| 1232 |
+
pass
|
| 1233 |
+
|
| 1234 |
+
return cleaned.strip()
|
| 1235 |
+
|
| 1236 |
# --- Namespace and wrapper helpers to avoid XML parser errors ---
|
| 1237 |
STANDARD_NAMESPACES = {
|
| 1238 |
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
|
|
|
| 1317 |
if not rdf_content.strip():
|
| 1318 |
return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
|
| 1319 |
|
| 1320 |
+
steps_log: List[str] = []
|
| 1321 |
+
|
| 1322 |
+
# Check if validator is available
|
| 1323 |
+
if not VALIDATOR_AVAILABLE:
|
| 1324 |
+
error_msg = "Validator module is not available. Please check that validator.py is present and all dependencies are installed."
|
| 1325 |
+
steps_log.append(f"ERROR: {error_msg}")
|
| 1326 |
+
return "❌ Error", error_msg, "", "\n".join(steps_log) if show_steps else "", "", "", ""
|
| 1327 |
+
|
| 1328 |
+
# Prepare and validate RDF
|
| 1329 |
+
steps_log.append(f"Preparing RDF for validation (original length: {len(rdf_content)} chars)")
|
| 1330 |
+
prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content, steps_log=steps_log if show_steps else None)
|
| 1331 |
+
steps_log.append(f"Preprocessed RDF (new length: {len(prepped_input)} chars)")
|
| 1332 |
+
|
| 1333 |
+
# Call validation
|
| 1334 |
+
steps_log.append(f"Calling validator with template '{template}'")
|
| 1335 |
result = validate_rdf_tool(prepped_input, template)
|
| 1336 |
|
| 1337 |
if "error" in result:
|
| 1338 |
+
steps_log.append(f"Validation error: {result['error']}")
|
| 1339 |
+
return f"❌ Error: {result['error']}", "", "", "\n".join(steps_log) if show_steps else "", "", "", ""
|
| 1340 |
|
| 1341 |
status = result["status"]
|
| 1342 |
results_text = result["results"]
|
| 1343 |
+
conforms = result["conforms"]
|
| 1344 |
+
|
| 1345 |
+
steps_log.append(f"Initial validation: {'PASSED' if conforms else 'FAILED'} using template '{template}'")
|
| 1346 |
+
|
| 1347 |
+
# Log if we got unexpected empty results
|
| 1348 |
+
if not results_text or len(results_text.strip()) == 0:
|
| 1349 |
+
steps_log.append("WARNING: Validator returned empty results text")
|
| 1350 |
|
| 1351 |
# Filter results if warnings should be excluded
|
| 1352 |
filtered_results = results_text
|
|
|
|
| 1366 |
filtered_lines.append(line)
|
| 1367 |
|
| 1368 |
filtered_results = '\n'.join(filtered_lines)
|
| 1369 |
+
if not include_warnings:
|
| 1370 |
+
steps_log.append("Filtered out warnings from results")
|
| 1371 |
|
| 1372 |
corrected_status = ""
|
| 1373 |
corrected_results = ""
|
| 1374 |
+
|
|
|
|
| 1375 |
if not include_warnings:
|
| 1376 |
steps_log.append("Configured to ignore warnings in AI processing")
|
| 1377 |
if iterate_until_valid:
|
| 1378 |
steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
|
| 1379 |
+
if conforms:
|
| 1380 |
suggestions = "✅ No issues found! Your RDF/XML is valid according to the selected template."
|
| 1381 |
corrected_rdf = ""
|
| 1382 |
corrected_status = "β"
|
|
|
|
| 1398 |
)
|
| 1399 |
# Attempt re-validation of corrected RDF
|
| 1400 |
try:
|
| 1401 |
+
# Clean the corrected output for validation
|
| 1402 |
+
corrected_xml = clean_xml_for_validation(corrected_rdf)
|
| 1403 |
+
corrected_xml = extract_xml_from_text(corrected_xml)
|
| 1404 |
corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
|
| 1405 |
+
|
| 1406 |
+
# Debug logging
|
| 1407 |
+
steps_log.append(f"Re-validating cleaned RDF ({len(corrected_xml)} chars)")
|
| 1408 |
+
if show_steps:
|
| 1409 |
+
# Log first 200 chars of what we're validating
|
| 1410 |
+
preview = corrected_xml[:200] + "..." if len(corrected_xml) > 200 else corrected_xml
|
| 1411 |
+
steps_log.append(f"Preview: {preview}")
|
| 1412 |
+
|
| 1413 |
reval = validate_rdf_tool(corrected_xml, template)
|
| 1414 |
if "error" in reval:
|
| 1415 |
corrected_status = f"❌ Re-validation Error: {reval['error']}"
|
|
|
|
| 1418 |
else:
|
| 1419 |
corrected_status = reval.get("status", "")
|
| 1420 |
corrected_results = reval.get("results", "")
|
| 1421 |
+
conforms = reval.get('conforms', False)
|
| 1422 |
+
steps_log.append(f"Re-validation: {corrected_status} - Conforms: {conforms}")
|
| 1423 |
except Exception as re_ex:
|
| 1424 |
corrected_status = f"❌ Re-validation Error: {re_ex}"
|
| 1425 |
corrected_results = ""
|