Spaces:

jimfhahn
/

mcp4rdf

Sleeping

App Files Files Community

RDF Validation Deployment committed on Oct 4

Commit

af9e2c1

1 Parent(s): 48a9d7d

streamline validation

Browse files

Files changed (1) hide show

app.py +261 -4

app.py CHANGED Viewed

@@ -71,6 +71,10 @@ ENABLE_VALIDATION_LOOP = True  # Enable validation loop by default
 MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
 MCP4BIBFRAME_DOCS_ENABLED = True  # Set to False to disable doc integration
 def test_validator_functionality():
     """Test if the validator is actually working"""
     if not VALIDATOR_AVAILABLE:
@@ -165,6 +169,30 @@ def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Opti
     return None
 def extract_bibframe_terms_from_errors(validation_results: str) -> dict:
     """
     Extract BibFrame properties and classes mentioned in validation errors.
@@ -246,7 +274,8 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
         # Query information for key properties
         for prop in terms['properties'][:3]:  # Limit queries
-            result = query_bibframe_docs("get_property_info", {"property_uri": prop})
             if result and isinstance(result, dict):
                 guidance_parts.append(f"\n**{result.get('label', prop)}** ({prop}):")
                 if 'definition' in result:
@@ -260,7 +289,8 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
         # Query information for key classes
         for cls in terms['classes'][:2]:  # Limit queries
-            result = query_bibframe_docs("get_class_info", {"class_name": cls})
             if result and isinstance(result, dict):
                 guidance_parts.append(f"\n**{result.get('label', cls)}** class:")
                 if 'definition' in result:
@@ -271,7 +301,7 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
         # If we found AdminMetadata issues, get specific usage guidance
         if any(term in validation_results.lower() for term in ['adminmetadata', 'assigner', '->bf:assigner']):
-            result = query_bibframe_docs("get_property_usage", {
                 "property_name": "assigner",
                 "class_name": "AdminMetadata"
             })
@@ -1052,6 +1082,155 @@ def fix_common_rdf_errors(rdf_xml: str) -> str:
     return rdf_xml
 def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
     """
     Generate AI-powered corrected RDF/XML based on validation errors.
@@ -1318,6 +1497,84 @@ Output ONLY valid RDF/XML following these rules:
 {generate_manual_correction_hints(validation_results, rdf_content)}"""
 def generate_manual_suggestions(validation_results: str) -> str:
     """Generate generic, pattern-based suggestions when AI is not available.
@@ -1678,7 +1935,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
             # Pass filtered results to AI functions
             suggestions = get_ai_suggestions(filtered_results, rdf_content, include_warnings)
             steps_log.append("Requested AI suggestions for concise guidance")
-            corrected_rdf = get_ai_correction(
                 filtered_results,
                 rdf_content,
                 template,

 MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
 MCP4BIBFRAME_DOCS_ENABLED = True  # Set to False to disable doc integration
+# Cache BibFrame documentation responses to avoid repeated network calls.
+# Maps a "<tool_name>:<serialized params>" key to a (response, time.time() when stored) tuple.
+BIBFRAME_DOCS_CACHE: Dict[str, tuple[Any, float]] = {}
+# Cached entries at least this old are ignored and the docs service is queried again.
+BIBFRAME_DOCS_CACHE_TTL = 3600  # seconds
 def test_validator_functionality():
     """Test if the validator is actually working"""
     if not VALIDATOR_AVAILABLE:
     return None
+def query_bibframe_docs_cached(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
+    """Look up BibFrame docs through ``query_bibframe_docs``, reusing a fresh cached answer.
+
+    Args:
+        tool_name: Name of the docs tool to call; also the first half of the cache key.
+        params: Tool arguments. Serialized with sorted keys so equal dicts share a key;
+            if they are not JSON-serializable, ``str(params)`` is used instead.
+        timeout: Passed through to ``query_bibframe_docs``.
+
+    Returns:
+        The cached or freshly fetched response, or ``None`` when the docs integration
+        is disabled or the underlying query returned ``None``.
+    """
+    if not MCP4BIBFRAME_DOCS_ENABLED:
+        return None
+    try:
+        serialized_params = json.dumps(params, sort_keys=True)
+    except TypeError:
+        serialized_params = str(params)
+    cache_key = f"{tool_name}:{serialized_params}"
+    entry = BIBFRAME_DOCS_CACHE.get(cache_key)
+    if entry:
+        payload, stored_at = entry
+        # Only serve the entry while it is younger than the TTL; otherwise refetch below.
+        if time.time() - stored_at < BIBFRAME_DOCS_CACHE_TTL:
+            logger.debug(f"Using cached BibFrame docs response for {cache_key}")
+            return payload
+    fresh = query_bibframe_docs(tool_name, params, timeout)
+    if fresh is None:
+        # A None response is never stored, so the next call queries the service again.
+        return None
+    BIBFRAME_DOCS_CACHE[cache_key] = (fresh, time.time())
+    return fresh
 def extract_bibframe_terms_from_errors(validation_results: str) -> dict:
     """
     Extract BibFrame properties and classes mentioned in validation errors.
         # Query information for key properties
         for prop in terms['properties'][:3]:  # Limit queries
+            prop_uri = _resolve_bibframe_uri(prop)
+            result = query_bibframe_docs_cached("get_property_info", {"property_uri": prop_uri})
             if result and isinstance(result, dict):
                 guidance_parts.append(f"\n**{result.get('label', prop)}** ({prop}):")
                 if 'definition' in result:
         # Query information for key classes
         for cls in terms['classes'][:2]:  # Limit queries
+            cls_uri = _resolve_bibframe_uri(cls)
+            result = query_bibframe_docs_cached("get_class_info", {"class_uri": cls_uri})
             if result and isinstance(result, dict):
                 guidance_parts.append(f"\n**{result.get('label', cls)}** class:")
                 if 'definition' in result:
         # If we found AdminMetadata issues, get specific usage guidance
         if any(term in validation_results.lower() for term in ['adminmetadata', 'assigner', '->bf:assigner']):
+            result = query_bibframe_docs_cached("get_property_usage", {
                 "property_name": "assigner",
                 "class_name": "AdminMetadata"
             })
     return rdf_xml
+def extract_error_focus_points(validation_results: str) -> Dict[str, List[str]]:
+    """Identify the specific focus nodes and properties mentioned in validation errors.
+
+    Three kinds of text are picked out of the report:
+
+    * ``Focus Node: ...``  -> ``focus_nodes`` (surrounding angle brackets removed)
+    * ``Result Path: ...`` -> ``properties`` (reduced to the local name, whether the
+      path is written as ``title``, ``bf:title`` or a full/angle-bracketed URI)
+    * ``Less than N values on ...->bf:name`` -> ``missing_properties``
+
+    Args:
+        validation_results: Text of the validation report; may be empty or ``None``.
+
+    Returns:
+        A dict with the keys ``properties``, ``focus_nodes``, ``missing_properties``
+        and ``classes``. Each value is a sorted list without duplicates. ``classes``
+        is always empty here; the key is kept so callers can read it unconditionally.
+    """
+    import re
+    focus: Dict[str, List[str]] = {
+        "properties": [],
+        "focus_nodes": [],
+        "missing_properties": [],
+        "classes": [],
+    }
+    if not validation_results:
+        return focus
+    node_set = {
+        match.group(1)
+        for match in re.finditer(r"Focus Node:\s*(?:<)?([^\s>]+)(?:>)?", validation_results)
+    }
+    property_set = set()
+    for match in re.finditer(r"Result Path:\s*<?([^\s<>]+)>?", validation_results):
+        # Keep only what follows the last "/", "#" or ":" so that a URI or a prefixed
+        # name yields the property's local name rather than a host, path or prefix.
+        local = re.split(r"[/#:]", match.group(1).rstrip("/#"))[-1]
+        name = re.match(r"[A-Za-z][A-Za-z0-9_]*", local)
+        if name:
+            property_set.add(name.group(0))
+    missing_set = {
+        match.group(1)
+        for match in re.finditer(r"Less than \d+ values on .*->bf:([A-Za-z]+)", validation_results)
+    }
+    focus["properties"] = sorted(property_set)
+    focus["focus_nodes"] = sorted(node_set)
+    focus["missing_properties"] = sorted(missing_set)
+    return focus
+# Namespace IRIs for the prefixes this resolver knows how to expand.
+_BIBFRAME_NAMESPACES = {
+    "bf": "http://id.loc.gov/ontologies/bibframe/",
+    "bflc": "http://id.loc.gov/ontologies/bflc/",
+    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+}
+def _resolve_bibframe_uri(name: str) -> str:
+    """Expand a property or class name into a full URI.
+
+    * Empty/``None`` input and anything already starting with ``http://`` or
+      ``https://`` is returned unchanged.
+    * A bare name (``title``) is placed in the BIBFRAME namespace.
+    * A prefixed name is expanded when the prefix is known (``bf``, ``bflc``,
+      ``rdf``, ``rdfs``); with an unknown prefix it is returned unchanged rather
+      than being pasted, colon and all, onto the BIBFRAME namespace.
+    """
+    if not name:
+        return name
+    if name.startswith(("http://", "https://")):
+        return name
+    prefix, separator, local = name.partition(":")
+    if not separator:
+        return _BIBFRAME_NAMESPACES["bf"] + name
+    namespace = _BIBFRAME_NAMESPACES.get(prefix)
+    if namespace is None:
+        return name
+    return namespace + local
+def get_targeted_bibframe_guidance(properties: List[str], classes: List[str]) -> Dict[str, dict]:
+    """Fetch BibFrame documentation for only the specified properties/classes.
+
+    At most the first five properties and the first five classes are looked up, each
+    with a 5-second timeout. Results are keyed by the name exactly as it was passed
+    in; names whose lookup returns nothing are left out. Properties are processed
+    before classes, so a class with the same name as a property replaces its entry.
+
+    Returns:
+        Mapping of name -> documentation payload; empty when the docs integration
+        is disabled.
+    """
+    collected: Dict[str, dict] = {}
+    if not MCP4BIBFRAME_DOCS_ENABLED:
+        return collected
+    lookups = (
+        (properties, "get_property_info", "property_uri"),
+        (classes, "get_class_info", "class_uri"),
+    )
+    for names, tool_name, uri_param in lookups:
+        for term in names[:5]:
+            term_uri = _resolve_bibframe_uri(term)
+            info = query_bibframe_docs_cached(tool_name, {uri_param: term_uri}, timeout=5)
+            if info:
+                collected[term] = info
+    return collected
+def generate_property_specific_fix(property_name: str, guidance: Optional[dict] = None) -> str:
+    """Return an RDF/XML template snippet for one missing BibFrame property.
+
+    The property name is matched case-insensitively against a fixed set of templates
+    (title, language, content, contribution, classification, adminMetadata). Any other
+    name produces a one-line placeholder element built from ``property_name`` exactly
+    as given. ``guidance`` is accepted but none of the snippets are derived from it.
+    """
+    guidance = guidance or {}
+    key = property_name.lower() if property_name else ""
+    templates = {
+        "title": """<bf:title>
+    <bf:Title>
+        <bf:mainTitle>PLACEHOLDER_TITLE</bf:mainTitle>
+    </bf:Title>
+</bf:title>""",
+        "language": """<bf:language>
+    <bf:Language rdf:about="http://id.loc.gov/vocabulary/languages/eng">
+        <rdfs:label xml:lang="en">English</rdfs:label>
+        <bf:code rdf:datatype="http://www.w3.org/2001/XMLSchema#string">eng</bf:code>
+    </bf:Language>
+</bf:language>""",
+        "content": """<bf:content>
+    <bf:Content rdf:about="http://id.loc.gov/vocabulary/contentTypes/txt">
+        <rdfs:label>text</rdfs:label>
+        <bf:code>txt</bf:code>
+    </bf:Content>
+</bf:content>""",
+        "contribution": """<bf:contribution>
+    <bf:PrimaryContribution>
+        <bf:agent>
+            <bf:Agent>
+                <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Person"/>
+                <rdfs:label>Author Name</rdfs:label>
+            </bf:Agent>
+        </bf:agent>
+        <bf:role>
+            <bf:Role rdf:about="http://id.loc.gov/vocabulary/relators/aut">
+                <rdfs:label>author</rdfs:label>
+                <bf:code>aut</bf:code>
+            </bf:Role>
+        </bf:role>
+    </bf:PrimaryContribution>
+</bf:contribution>""",
+        "classification": """<bf:classification>
+    <bf:ClassificationLcc>
+        <bf:classificationPortion>TT820</bf:classificationPortion>
+        <bf:itemPortion>.B877 2002</bf:itemPortion>
+        <bf:assigner>
+            <bf:Organization rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
+                <rdfs:label>United States, Library of Congress</rdfs:label>
+            </bf:Organization>
+        </bf:assigner>
+    </bf:ClassificationLcc>
+</bf:classification>""",
+        "adminmetadata": """<bf:adminMetadata>
+    <bf:AdminMetadata>
+        <bf:status>
+            <bf:Status rdf:about="http://id.loc.gov/vocabulary/mstatus/n">
+                <rdfs:label>new</rdfs:label>
+                <bf:code>n</bf:code>
+            </bf:Status>
+        </bf:status>
+        <bf:date rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2024-01-01</bf:date>
+        <bf:agent>
+            <bf:Agent rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
+                <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Organization"/>
+                <rdfs:label>United States, Library of Congress</rdfs:label>
+            </bf:Agent>
+        </bf:agent>
+    </bf:AdminMetadata>
+</bf:adminMetadata>""",
+    }
+    if key in templates:
+        return templates[key]
+    # No template for this property: emit a simple literal placeholder element.
+    return f"<bf:{property_name}>PLACEHOLDER_VALUE</bf:{property_name}>"
 def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
     """
     Generate AI-powered corrected RDF/XML based on validation errors.
 {generate_manual_correction_hints(validation_results, rdf_content)}"""
+def get_ai_correction_targeted(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
+    """Fast path that attempts structured quick fixes before invoking the full AI loop.
+
+    When the report names between one and five missing ``bf:`` properties, a template
+    snippet for each one not already present is appended inside the first ``bf:Work``
+    element, or inside the first ``bf:Instance`` when the document has no ``bf:Work``
+    element. If something was injected and the validator is available, the patched
+    document is re-validated and returned straight away when it conforms. In every
+    other case the call falls through to ``get_ai_correction`` with the (possibly
+    patched) RDF and the most recent validation report.
+
+    Args:
+        validation_results: Text of the validation report for ``rdf_content``.
+        rdf_content: The RDF/XML document to correct.
+        template: Validation template name, passed to the validator and the AI loop.
+        max_attempts: Passed through to ``get_ai_correction``.
+        include_warnings: Passed through to ``get_ai_correction``.
+        enable_validation_loop: Passed through to ``get_ai_correction``.
+        steps_log: Optional list that receives one human-readable line per step.
+
+    Returns:
+        The corrected RDF/XML: the quick-fixed document when it validates, otherwise
+        whatever ``get_ai_correction`` returns.
+    """
+    import re
+    def _log(message: str) -> None:
+        # Step logging is optional; do nothing when the caller passed no list.
+        if steps_log is not None:
+            steps_log.append(message)
+    focus_points = extract_error_focus_points(validation_results)
+    missing_props = focus_points.get("missing_properties", [])
+    _log(f"Targeted fix: detected {len(missing_props)} missing properties")
+    if missing_props:
+        preview = ", ".join(missing_props[:5])
+        if len(missing_props) > 5:
+            preview += ", ..."
+        _log(f"Missing list: {preview}")
+    working_rdf = rdf_content
+    quick_fix_attempted = False
+    if missing_props and len(missing_props) <= 5:
+        guidance = get_targeted_bibframe_guidance(missing_props, focus_points.get("classes", []))
+        _log(f"Retrieved guidance entries: {len(guidance)}")
+        def _inject_snippets(match: re.Match) -> str:
+            """Append a snippet for every missing property the element lacks."""
+            nonlocal quick_fix_attempted
+            opening, inner, closing = match.groups()
+            new_bits = [
+                generate_property_specific_fix(prop, guidance.get(prop))
+                for prop in missing_props
+                if f"<bf:{prop}" not in inner
+            ]
+            if not new_bits:
+                return match.group(0)
+            quick_fix_attempted = True
+            # Strip both angle brackets so an attribute-less tag logs as "bf:Work".
+            element_name = opening.split()[0].strip("<>")
+            _log(f"Injected {len(new_bits)} snippets into {element_name}")
+            combined = opening + inner
+            if not inner.endswith("\n"):
+                combined += "\n"
+            combined += "    " + "\n    ".join(new_bits) + "\n" + closing
+            return combined
+        # The tag name must be followed by whitespace or ">" so longer element names
+        # are not matched, and ``(?<!/)`` rejects self-closing reference tags such as
+        # <bf:Work rdf:about="..."/>, which have no body to inject into.
+        # NOTE(review): the lazy body still ends at the first closing tag, so an
+        # element that nests another element of the same name is cut short.
+        work_pattern = re.compile(r"(<bf:Work(?:\s[^>]*)?(?<!/)>)([\s\S]*?)(</bf:Work>)")
+        instance_pattern = re.compile(r"(<bf:Instance(?:\s[^>]*)?(?<!/)>)([\s\S]*?)(</bf:Instance>)")
+        if work_pattern.search(working_rdf):
+            working_rdf = work_pattern.sub(_inject_snippets, working_rdf, count=1)
+        elif instance_pattern.search(working_rdf):
+            working_rdf = instance_pattern.sub(_inject_snippets, working_rdf, count=1)
+        if quick_fix_attempted and VALIDATOR_AVAILABLE:
+            try:
+                conforms, new_results = validate_rdf(working_rdf.encode('utf-8'), template)
+            except Exception as quick_err:
+                # Best effort: a validator failure must not block the AI fallback.
+                _log(f"Quick fix validation error: {quick_err}; using AI fallback")
+            else:
+                if conforms:
+                    _log("Quick fix succeeded; validation now passes")
+                    return working_rdf
+                _log("Quick fix incomplete; falling back to AI loop")
+                # Hand the AI loop the report for the patched document when there is one.
+                validation_results = new_results or validation_results
+    return get_ai_correction(
+        validation_results,
+        working_rdf,
+        template,
+        max_attempts=max_attempts,
+        include_warnings=include_warnings,
+        enable_validation_loop=enable_validation_loop,
+        steps_log=steps_log,
+    )
 def generate_manual_suggestions(validation_results: str) -> str:
     """Generate generic, pattern-based suggestions when AI is not available.
             # Pass filtered results to AI functions
             suggestions = get_ai_suggestions(filtered_results, rdf_content, include_warnings)
             steps_log.append("Requested AI suggestions for concise guidance")
+            corrected_rdf = get_ai_correction_targeted(
                 filtered_results,
                 rdf_content,
                 template,