RDF Validation Deployment committed on
Commit
a6b66ad
·
1 Parent(s): 66fc039

πŸ› Fix validator debugging and error handling

Browse files

- Add test_validator_functionality() to test validator on startup
- Enhanced validate_rdf_tool() with detailed logging and error handling
- Updated validate_rdf_interface() with comprehensive step logging
- Added clean_xml_for_validation() to remove HTML comments from AI output
- Better error messages for import/attribute errors in validator
- Debug preview of XML content being validated
- Warning detection for suspicious validation results (conforms=True with no feedback)
- More detailed re-validation logging to track correction success/failure

This should help diagnose why invalid RDF might be showing as valid.

Files changed (1) hide show
  1. app.py +146 -11
app.py CHANGED
@@ -25,9 +25,19 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
25
  try:
26
  from validator import validate_rdf
27
  VALIDATOR_AVAILABLE = True
28
- except ImportError:
 
 
 
 
 
 
 
 
 
 
29
  VALIDATOR_AVAILABLE = False
30
- print("⚠️ Warning: validator.py not found. Some features may be limited.")
31
 
32
  # Optional: Check if OpenAI and requests are available
33
  try:
@@ -61,6 +71,33 @@ ENABLE_VALIDATION_LOOP = True # Enable validation loop by default
61
  MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
62
  MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
65
  """
66
  Query the MCP4BibFrame documentation API using the MCP protocol.
@@ -332,23 +369,51 @@ def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
332
  return {"error": "No RDF/XML content provided", "conforms": False}
333
 
334
  if not VALIDATOR_AVAILABLE:
 
335
  return {
336
  "error": "Validator not available - ensure validator.py is present",
337
  "conforms": False
338
  }
339
 
340
  try:
 
 
 
 
341
  conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
342
 
 
 
 
 
 
 
 
 
 
343
  return {
344
  "conforms": conforms,
345
- "results": results_text,
346
  "template": template,
347
  "status": "βœ… Valid RDF" if conforms else "❌ Invalid RDF"
348
  }
349
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  except Exception as e:
351
  logger.error(f"Validation error: {str(e)}")
 
 
352
  return {
353
  "error": f"Validation failed: {str(e)}",
354
  "conforms": False
@@ -1131,6 +1196,43 @@ def extract_xml_from_text(text: str) -> str:
1131
  fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
1132
  return fenced if fenced else text
1133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1134
  # --- Namespace and wrapper helpers to avoid XML parser errors ---
1135
  STANDARD_NAMESPACES = {
1136
  "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
@@ -1215,15 +1317,36 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1215
  if not rdf_content.strip():
1216
  return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
1217
 
1218
- # Validate RDF
1219
- prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
  result = validate_rdf_tool(prepped_input, template)
1221
 
1222
  if "error" in result:
1223
- return f"❌ Error: {result['error']}", "", "", "", "", "", ""
 
1224
 
1225
  status = result["status"]
1226
  results_text = result["results"]
 
 
 
 
 
 
 
1227
 
1228
  # Filter results if warnings should be excluded
1229
  filtered_results = results_text
@@ -1243,16 +1366,17 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1243
  filtered_lines.append(line)
1244
 
1245
  filtered_results = '\n'.join(filtered_lines)
 
 
1246
 
1247
  corrected_status = ""
1248
  corrected_results = ""
1249
- steps_log: List[str] = []
1250
- steps_log.append(f"Initial validation: {'PASSED' if result['conforms'] else 'FAILED'} using template '{template}'")
1251
  if not include_warnings:
1252
  steps_log.append("Configured to ignore warnings in AI processing")
1253
  if iterate_until_valid:
1254
  steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
1255
- if result["conforms"]:
1256
  suggestions = "βœ… No issues found! Your RDF/XML is valid according to the selected template."
1257
  corrected_rdf = ""
1258
  corrected_status = "β€”"
@@ -1274,8 +1398,18 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1274
  )
1275
  # Attempt re-validation of corrected RDF
1276
  try:
1277
- corrected_xml = extract_xml_from_text(corrected_rdf)
 
 
1278
  corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
 
 
 
 
 
 
 
 
1279
  reval = validate_rdf_tool(corrected_xml, template)
1280
  if "error" in reval:
1281
  corrected_status = f"❌ Re-validation Error: {reval['error']}"
@@ -1284,7 +1418,8 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1284
  else:
1285
  corrected_status = reval.get("status", "")
1286
  corrected_results = reval.get("results", "")
1287
- steps_log.append(f"Re-validation: {corrected_status}")
 
1288
  except Exception as re_ex:
1289
  corrected_status = f"❌ Re-validation Error: {re_ex}"
1290
  corrected_results = ""
 
25
  try:
26
  from validator import validate_rdf
27
  VALIDATOR_AVAILABLE = True
28
+ # Test that the function is callable
29
+ if not callable(validate_rdf):
30
+ print("⚠️ Warning: validate_rdf is not callable")
31
+ VALIDATOR_AVAILABLE = False
32
+ else:
33
+ print("βœ… Validator module loaded successfully")
34
+ except ImportError as e:
35
+ VALIDATOR_AVAILABLE = False
36
+ print(f"⚠️ Warning: validator.py not found or has import errors: {e}")
37
+ print("Some features may be limited.")
38
+ except Exception as e:
39
  VALIDATOR_AVAILABLE = False
40
+ print(f"⚠️ Warning: Error loading validator: {e}")
41
 
42
  # Optional: Check if OpenAI and requests are available
43
  try:
 
71
  MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
72
  MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
73
 
74
+ def test_validator_functionality():
75
+ """Test if the validator is actually working"""
76
+ if not VALIDATOR_AVAILABLE:
77
+ print("❌ Validator not available for testing")
78
+ return False
79
+
80
+ try:
81
+ # Test with a simple invalid RDF
82
+ test_rdf = '<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><bf:Work/></rdf:RDF>'
83
+ conforms, results = validate_rdf(test_rdf.encode('utf-8'), 'monograph')
84
+
85
+ # This should fail validation (missing namespace, missing properties)
86
+ if conforms:
87
+ print("⚠️ WARNING: Validator returned 'conforms=True' for invalid RDF. Validator may not be working correctly!")
88
+ return False
89
+ else:
90
+ print(f"βœ… Validator test passed. Got expected failure: {results[:100] if results else 'No results'}")
91
+ return True
92
+
93
+ except Exception as e:
94
+ print(f"❌ Validator test failed with error: {e}")
95
+ return False
96
+
97
+ # Run the test on startup
98
+ if VALIDATOR_AVAILABLE:
99
+ test_validator_functionality()
100
+
101
  def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
102
  """
103
  Query the MCP4BibFrame documentation API using the MCP protocol.
 
369
  return {"error": "No RDF/XML content provided", "conforms": False}
370
 
371
  if not VALIDATOR_AVAILABLE:
372
+ logger.error("Validator module not available")
373
  return {
374
  "error": "Validator not available - ensure validator.py is present",
375
  "conforms": False
376
  }
377
 
378
  try:
379
+ # Log what we're validating
380
+ logger.info(f"Validating RDF with template '{template}', content length: {len(rdf_content)}")
381
+
382
+ # Call the validator
383
  conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
384
 
385
+ # Debug logging
386
+ logger.info(f"Validation result - conforms: {conforms}, results length: {len(results_text) if results_text else 0}")
387
+
388
+ # If no results text but claims to conform, something might be wrong
389
+ if conforms and (not results_text or len(results_text.strip()) == 0):
390
+ results_text = "Validation passed with no specific feedback."
391
+ elif not conforms and (not results_text or len(results_text.strip()) == 0):
392
+ results_text = "Validation failed but no specific errors were returned. Check the RDF syntax and structure."
393
+
394
  return {
395
  "conforms": conforms,
396
+ "results": results_text if results_text else "",
397
  "template": template,
398
  "status": "βœ… Valid RDF" if conforms else "❌ Invalid RDF"
399
  }
400
 
401
+ except ImportError as e:
402
+ logger.error(f"Import error in validator: {str(e)}")
403
+ return {
404
+ "error": f"Validator import error: {str(e)}. Check that all dependencies are installed.",
405
+ "conforms": False
406
+ }
407
+ except AttributeError as e:
408
+ logger.error(f"Validator function not found: {str(e)}")
409
+ return {
410
+ "error": f"Validator function error: {str(e)}. Check validator.py implementation.",
411
+ "conforms": False
412
+ }
413
  except Exception as e:
414
  logger.error(f"Validation error: {str(e)}")
415
+ import traceback
416
+ logger.error(f"Full traceback: {traceback.format_exc()}")
417
  return {
418
  "error": f"Validation failed: {str(e)}",
419
  "conforms": False
 
1196
  fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
1197
  return fenced if fenced else text
1198
 
1199
+ def clean_xml_for_validation(xml_text: str) -> str:
1200
+ """
1201
+ Clean XML text for validation by removing comments and extra formatting.
1202
+
1203
+ Args:
1204
+ xml_text (str): XML text that may contain comments or formatting
1205
+
1206
+ Returns:
1207
+ str: Clean XML ready for validation
1208
+ """
1209
+ import re
1210
+
1211
+ if not xml_text:
1212
+ return xml_text
1213
+
1214
+ # Remove all HTML comments
1215
+ cleaned = re.sub(r'<!--.*?-->', '', xml_text, flags=re.DOTALL)
1216
+
1217
+ # Remove any leading/trailing whitespace
1218
+ cleaned = cleaned.strip()
1219
+
1220
+ # If the text starts with "```" code fence, extract content
1221
+ if cleaned.startswith("```"):
1222
+ try:
1223
+ # Extract content between code fences
1224
+ parts = cleaned.split("```")
1225
+ if len(parts) >= 3:
1226
+ # Second part should be the XML content
1227
+ cleaned = parts[1]
1228
+ # Remove language identifier if present (e.g., "xml")
1229
+ if cleaned.startswith("xml"):
1230
+ cleaned = cleaned[3:]
1231
+ except:
1232
+ pass
1233
+
1234
+ return cleaned.strip()
1235
+
1236
  # --- Namespace and wrapper helpers to avoid XML parser errors ---
1237
  STANDARD_NAMESPACES = {
1238
  "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 
1317
  if not rdf_content.strip():
1318
  return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
1319
 
1320
+ steps_log: List[str] = []
1321
+
1322
+ # Check if validator is available
1323
+ if not VALIDATOR_AVAILABLE:
1324
+ error_msg = "Validator module is not available. Please check that validator.py is present and all dependencies are installed."
1325
+ steps_log.append(f"ERROR: {error_msg}")
1326
+ return "❌ Error", error_msg, "", "\n".join(steps_log) if show_steps else "", "", "", ""
1327
+
1328
+ # Prepare and validate RDF
1329
+ steps_log.append(f"Preparing RDF for validation (original length: {len(rdf_content)} chars)")
1330
+ prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content, steps_log=steps_log if show_steps else None)
1331
+ steps_log.append(f"Preprocessed RDF (new length: {len(prepped_input)} chars)")
1332
+
1333
+ # Call validation
1334
+ steps_log.append(f"Calling validator with template '{template}'")
1335
  result = validate_rdf_tool(prepped_input, template)
1336
 
1337
  if "error" in result:
1338
+ steps_log.append(f"Validation error: {result['error']}")
1339
+ return f"❌ Error: {result['error']}", "", "", "\n".join(steps_log) if show_steps else "", "", "", ""
1340
 
1341
  status = result["status"]
1342
  results_text = result["results"]
1343
+ conforms = result["conforms"]
1344
+
1345
+ steps_log.append(f"Initial validation: {'PASSED' if conforms else 'FAILED'} using template '{template}'")
1346
+
1347
+ # Log if we got unexpected empty results
1348
+ if not results_text or len(results_text.strip()) == 0:
1349
+ steps_log.append("WARNING: Validator returned empty results text")
1350
 
1351
  # Filter results if warnings should be excluded
1352
  filtered_results = results_text
 
1366
  filtered_lines.append(line)
1367
 
1368
  filtered_results = '\n'.join(filtered_lines)
1369
+ if not include_warnings:
1370
+ steps_log.append("Filtered out warnings from results")
1371
 
1372
  corrected_status = ""
1373
  corrected_results = ""
1374
+
 
1375
  if not include_warnings:
1376
  steps_log.append("Configured to ignore warnings in AI processing")
1377
  if iterate_until_valid:
1378
  steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
1379
+ if conforms:
1380
  suggestions = "βœ… No issues found! Your RDF/XML is valid according to the selected template."
1381
  corrected_rdf = ""
1382
  corrected_status = "β€”"
 
1398
  )
1399
  # Attempt re-validation of corrected RDF
1400
  try:
1401
+ # Clean the corrected output for validation
1402
+ corrected_xml = clean_xml_for_validation(corrected_rdf)
1403
+ corrected_xml = extract_xml_from_text(corrected_xml)
1404
  corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
1405
+
1406
+ # Debug logging
1407
+ steps_log.append(f"Re-validating cleaned RDF ({len(corrected_xml)} chars)")
1408
+ if show_steps:
1409
+ # Log first 200 chars of what we're validating
1410
+ preview = corrected_xml[:200] + "..." if len(corrected_xml) > 200 else corrected_xml
1411
+ steps_log.append(f"Preview: {preview}")
1412
+
1413
  reval = validate_rdf_tool(corrected_xml, template)
1414
  if "error" in reval:
1415
  corrected_status = f"❌ Re-validation Error: {reval['error']}"
 
1418
  else:
1419
  corrected_status = reval.get("status", "")
1420
  corrected_results = reval.get("results", "")
1421
+ conforms = reval.get('conforms', False)
1422
+ steps_log.append(f"Re-validation: {corrected_status} - Conforms: {conforms}")
1423
  except Exception as re_ex:
1424
  corrected_status = f"❌ Re-validation Error: {re_ex}"
1425
  corrected_results = ""