Spaces:

jimfhahn
/

mcp4rdf

Sleeping

RDF Validation Deployment committed on Oct 4

Commit

a6b66ad

1 Parent(s): 66fc039

🐛 Fix validator debugging and error handling

- Add test_validator_functionality() to test the validator on startup
- Enhance validate_rdf_tool() with detailed logging and error handling
- Update validate_rdf_interface() with comprehensive step logging
- Add clean_xml_for_validation() to remove XML/HTML comments from AI output
- Improve error messages for import/attribute errors in the validator
- Add a debug preview of the XML content being validated
- Add warning detection for suspicious validation results (conforms=True with no feedback)
- Add more detailed re-validation logging to track correction success/failure

This should help diagnose why invalid RDF might be showing as valid.

Files changed (1) hide show

app.py +146 -11

app.py CHANGED Viewed

@@ -25,9 +25,19 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 try:
     from validator import validate_rdf
     VALIDATOR_AVAILABLE = True
-except ImportError:
     VALIDATOR_AVAILABLE = False
-    print("⚠️ Warning: validator.py not found. Some features may be limited.")
 # Optional: Check if OpenAI and requests are available
 try:
@@ -61,6 +71,33 @@ ENABLE_VALIDATION_LOOP = True  # Enable validation loop by default
 MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
 MCP4BIBFRAME_DOCS_ENABLED = True  # Set to False to disable doc integration
 def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
     """
     Query the MCP4BibFrame documentation API using the MCP protocol.
@@ -332,23 +369,51 @@ def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
         return {"error": "No RDF/XML content provided", "conforms": False}
     if not VALIDATOR_AVAILABLE:
         return {
             "error": "Validator not available - ensure validator.py is present",
             "conforms": False
         }
     try:
         conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
         return {
             "conforms": conforms,
-            "results": results_text,
             "template": template,
             "status": "✅ Valid RDF" if conforms else "❌ Invalid RDF"
         }
     except Exception as e:
         logger.error(f"Validation error: {str(e)}")
         return {
             "error": f"Validation failed: {str(e)}",
             "conforms": False
@@ -1131,6 +1196,43 @@ def extract_xml_from_text(text: str) -> str:
     fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
     return fenced if fenced else text
 # --- Namespace and wrapper helpers to avoid XML parser errors ---
 STANDARD_NAMESPACES = {
     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
@@ -1215,15 +1317,36 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
     if not rdf_content.strip():
         return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
-    # Validate RDF
-    prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content)
     result = validate_rdf_tool(prepped_input, template)
     if "error" in result:
-        return f"❌ Error: {result['error']}", "", "", "", "", "", ""
     status = result["status"]
     results_text = result["results"]
     # Filter results if warnings should be excluded
     filtered_results = results_text
@@ -1243,16 +1366,17 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
                 filtered_lines.append(line)
         filtered_results = '\n'.join(filtered_lines)
     corrected_status = ""
     corrected_results = ""
-    steps_log: List[str] = []
-    steps_log.append(f"Initial validation: {'PASSED' if result['conforms'] else 'FAILED'} using template '{template}'")
     if not include_warnings:
         steps_log.append("Configured to ignore warnings in AI processing")
     if iterate_until_valid:
         steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
-    if result["conforms"]:
         suggestions = "✅ No issues found! Your RDF/XML is valid according to the selected template."
         corrected_rdf = ""
         corrected_status = "—"
@@ -1274,8 +1398,18 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
             )
             # Attempt re-validation of corrected RDF
             try:
-                corrected_xml = extract_xml_from_text(corrected_rdf)
                 corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
                 reval = validate_rdf_tool(corrected_xml, template)
                 if "error" in reval:
                     corrected_status = f"❌ Re-validation Error: {reval['error']}"
@@ -1284,7 +1418,8 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
                 else:
                     corrected_status = reval.get("status", "")
                     corrected_results = reval.get("results", "")
-                    steps_log.append(f"Re-validation: {corrected_status}")
             except Exception as re_ex:
                 corrected_status = f"❌ Re-validation Error: {re_ex}"
                 corrected_results = ""

 try:
+    # Optional dependency: VALIDATOR_AVAILABLE records whether validate_rdf
+    # could be imported and is usable, so callers can check it before use.
     from validator import validate_rdf
     VALIDATOR_AVAILABLE = True
+    # Test that the function is callable
+    if not callable(validate_rdf):
+        print("⚠️ Warning: validate_rdf is not callable")
+        VALIDATOR_AVAILABLE = False
+    else:
+        print("✅ Validator module loaded successfully")
+except ImportError as e:
+    VALIDATOR_AVAILABLE = False
+    print(f"⚠️ Warning: validator.py not found or has import errors: {e}")
+    print("Some features may be limited.")
+except Exception as e:
+    # Any non-import failure raised while loading the validator module
+    # also disables validation instead of aborting startup.
     VALIDATOR_AVAILABLE = False
+    print(f"⚠️ Warning: Error loading validator: {e}")
 # Optional: Check if OpenAI and requests are available
 try:
 MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
 MCP4BIBFRAME_DOCS_ENABLED = True  # Set to False to disable doc integration
+def test_validator_functionality():
+    """Smoke-test the imported validator at startup.
+
+    Sends a deliberately broken RDF/XML snippet (the ``bf`` prefix is never
+    declared and no properties are present) through ``validate_rdf`` with
+    the 'monograph' template and prints the outcome.
+
+    Returns:
+        bool: True if the validator rejected the snippet; False if the
+        validator is unavailable, accepted the snippet, or raised.
+    """
+    if not VALIDATOR_AVAILABLE:
+        print("❌ Validator not available for testing")
+        return False
+    try:
+        # Known-bad input: a working validator must not report it as conforming
+        broken_rdf = '<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><bf:Work/></rdf:RDF>'
+        payload = broken_rdf.encode('utf-8')
+        conforms, results = validate_rdf(payload, 'monograph')
+        if not conforms:
+            # Expected path: the snippet was rejected
+            print(f"✅ Validator test passed. Got expected failure: {results[:100] if results else 'No results'}")
+            return True
+        print("⚠️ WARNING: Validator returned 'conforms=True' for invalid RDF. Validator may not be working correctly!")
+        return False
+    except Exception as e:
+        # Kept inside one try so a failure anywhere in the check is reported, not raised
+        print(f"❌ Validator test failed with error: {e}")
+        return False
+# Run the test on startup
+if VALIDATOR_AVAILABLE:
+    test_validator_functionality()
 def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
     """
     Query the MCP4BibFrame documentation API using the MCP protocol.
         return {"error": "No RDF/XML content provided", "conforms": False}
     if not VALIDATOR_AVAILABLE:
+        logger.error("Validator module not available")
         return {
             "error": "Validator not available - ensure validator.py is present",
             "conforms": False
         }
     try:
+        # Log what we're validating
+        logger.info(f"Validating RDF with template '{template}', content length: {len(rdf_content)}")
+        # Call the validator
         conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
+        # Debug logging
+        logger.info(f"Validation result - conforms: {conforms}, results length: {len(results_text) if results_text else 0}")
+        # If no results text but claims to conform, something might be wrong
+        if conforms and (not results_text or len(results_text.strip()) == 0):
+            results_text = "Validation passed with no specific feedback."
+        elif not conforms and (not results_text or len(results_text.strip()) == 0):
+            results_text = "Validation failed but no specific errors were returned. Check the RDF syntax and structure."
         return {
             "conforms": conforms,
+            "results": results_text if results_text else "",
             "template": template,
             "status": "✅ Valid RDF" if conforms else "❌ Invalid RDF"
         }
+    except ImportError as e:
+        logger.error(f"Import error in validator: {str(e)}")
+        return {
+            "error": f"Validator import error: {str(e)}. Check that all dependencies are installed.",
+            "conforms": False
+        }
+    except AttributeError as e:
+        logger.error(f"Validator function not found: {str(e)}")
+        return {
+            "error": f"Validator function error: {str(e)}. Check validator.py implementation.",
+            "conforms": False
+        }
     except Exception as e:
         logger.error(f"Validation error: {str(e)}")
+        import traceback
+        logger.error(f"Full traceback: {traceback.format_exc()}")
         return {
             "error": f"Validation failed: {str(e)}",
             "conforms": False
     fenced = re.sub(r"^```[a-zA-Z]*\n|```$", "", text.strip())
     return fenced if fenced else text
+def clean_xml_for_validation(xml_text: str) -> str:
+    """
+    Clean XML text for validation by removing comments and code-fence wrapping.
+
+    Processing order: strip ``<!-- ... -->`` comments, trim whitespace, then,
+    if the text starts with a Markdown code fence, keep only the content of
+    the first fenced block and drop its language tag.
+
+    Args:
+        xml_text (str): XML text that may contain comments or formatting
+    Returns:
+        str: Clean XML ready for validation. Falsy input (``None`` or ``""``)
+        is returned unchanged.
+    """
+    import re
+    if not xml_text:
+        return xml_text
+    # Remove all <!-- ... --> comments; DOTALL lets a comment span several lines
+    cleaned = re.sub(r'<!--.*?-->', '', xml_text, flags=re.DOTALL)
+    # Remove any leading/trailing whitespace
+    cleaned = cleaned.strip()
+    if cleaned.startswith("```"):
+        # Because the text starts with a fence, split() always yields an empty
+        # first element followed by the fenced content, even when the closing
+        # fence is missing; anything after the closing fence is discarded.
+        cleaned = cleaned.split("```", 2)[1]
+        # Drop the fence's language tag (e.g. "xml", "XML", "rdf+xml"), but
+        # only when markup or nothing at all follows it, so real content is
+        # never mistaken for a tag.
+        cleaned = re.sub(r'^[A-Za-z][\w+.\-]*(?=\s*(?:<|\Z))', '', cleaned)
+    return cleaned.strip()
 # --- Namespace and wrapper helpers to avoid XML parser errors ---
 STANDARD_NAMESPACES = {
     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
     if not rdf_content.strip():
         return "❌ Error", "No RDF/XML data provided", "", "", "", "", ""
+    steps_log: List[str] = []
+    # Check if validator is available
+    if not VALIDATOR_AVAILABLE:
+        error_msg = "Validator module is not available. Please check that validator.py is present and all dependencies are installed."
+        steps_log.append(f"ERROR: {error_msg}")
+        return "❌ Error", error_msg, "", "\n".join(steps_log) if show_steps else "", "", "", ""
+    # Prepare and validate RDF
+    steps_log.append(f"Preparing RDF for validation (original length: {len(rdf_content)} chars)")
+    prepped_input = ensure_rdf_wrapper_and_namespaces(rdf_content, steps_log=steps_log if show_steps else None)
+    steps_log.append(f"Preprocessed RDF (new length: {len(prepped_input)} chars)")
+    # Call validation
+    steps_log.append(f"Calling validator with template '{template}'")
     result = validate_rdf_tool(prepped_input, template)
     if "error" in result:
+        steps_log.append(f"Validation error: {result['error']}")
+        return f"❌ Error: {result['error']}", "", "", "\n".join(steps_log) if show_steps else "", "", "", ""
     status = result["status"]
     results_text = result["results"]
+    conforms = result["conforms"]
+    steps_log.append(f"Initial validation: {'PASSED' if conforms else 'FAILED'} using template '{template}'")
+    # Log if we got unexpected empty results
+    if not results_text or len(results_text.strip()) == 0:
+        steps_log.append("WARNING: Validator returned empty results text")
     # Filter results if warnings should be excluded
     filtered_results = results_text
                 filtered_lines.append(line)
         filtered_results = '\n'.join(filtered_lines)
+        if not include_warnings:
+            steps_log.append("Filtered out warnings from results")
     corrected_status = ""
     corrected_results = ""
     if not include_warnings:
         steps_log.append("Configured to ignore warnings in AI processing")
     if iterate_until_valid:
         steps_log.append(f"Iteration enabled with max_attempts={max_attempts}")
+    if conforms:
         suggestions = "✅ No issues found! Your RDF/XML is valid according to the selected template."
         corrected_rdf = ""
         corrected_status = "—"
             )
             # Attempt re-validation of corrected RDF
             try:
+                # Clean the corrected output for validation
+                corrected_xml = clean_xml_for_validation(corrected_rdf)
+                corrected_xml = extract_xml_from_text(corrected_xml)
                 corrected_xml = ensure_rdf_wrapper_and_namespaces(corrected_xml, original_text=prepped_input, steps_log=steps_log)
+                # Debug logging
+                steps_log.append(f"Re-validating cleaned RDF ({len(corrected_xml)} chars)")
+                if show_steps:
+                    # Log first 200 chars of what we're validating
+                    preview = corrected_xml[:200] + "..." if len(corrected_xml) > 200 else corrected_xml
+                    steps_log.append(f"Preview: {preview}")
                 reval = validate_rdf_tool(corrected_xml, template)
                 if "error" in reval:
                     corrected_status = f"❌ Re-validation Error: {reval['error']}"
                 else:
                     corrected_status = reval.get("status", "")
                     corrected_results = reval.get("results", "")
+                    conforms = reval.get('conforms', False)
+                    steps_log.append(f"Re-validation: {corrected_status} - Conforms: {conforms}")
             except Exception as re_ex:
                 corrected_status = f"❌ Re-validation Error: {re_ex}"
                 corrected_results = ""