RDF Validation Deployment committed on
Commit
af9e2c1
·
1 Parent(s): 48a9d7d

streamline validation

Browse files
Files changed (1) hide show
  1. app.py +261 -4
app.py CHANGED
@@ -71,6 +71,10 @@ ENABLE_VALIDATION_LOOP = True # Enable validation loop by default
71
  MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
72
  MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
73
 
 
 
 
 
74
  def test_validator_functionality():
75
  """Test if the validator is actually working"""
76
  if not VALIDATOR_AVAILABLE:
@@ -165,6 +169,30 @@ def query_bibframe_docs(tool_name: str, params: dict, timeout: int = 10) -> Opti
165
 
166
  return None
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  def extract_bibframe_terms_from_errors(validation_results: str) -> dict:
169
  """
170
  Extract BibFrame properties and classes mentioned in validation errors.
@@ -246,7 +274,8 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
246
 
247
  # Query information for key properties
248
  for prop in terms['properties'][:3]: # Limit queries
249
- result = query_bibframe_docs("get_property_info", {"property_uri": prop})
 
250
  if result and isinstance(result, dict):
251
  guidance_parts.append(f"\n**{result.get('label', prop)}** ({prop}):")
252
  if 'definition' in result:
@@ -260,7 +289,8 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
260
 
261
  # Query information for key classes
262
  for cls in terms['classes'][:2]: # Limit queries
263
- result = query_bibframe_docs("get_class_info", {"class_name": cls})
 
264
  if result and isinstance(result, dict):
265
  guidance_parts.append(f"\n**{result.get('label', cls)}** class:")
266
  if 'definition' in result:
@@ -271,7 +301,7 @@ def fetch_bibframe_guidance(validation_results: str, rdf_content: str) -> str:
271
 
272
  # If we found AdminMetadata issues, get specific usage guidance
273
  if any(term in validation_results.lower() for term in ['adminmetadata', 'assigner', '->bf:assigner']):
274
- result = query_bibframe_docs("get_property_usage", {
275
  "property_name": "assigner",
276
  "class_name": "AdminMetadata"
277
  })
@@ -1052,6 +1082,155 @@ def fix_common_rdf_errors(rdf_xml: str) -> str:
1052
 
1053
  return rdf_xml
1054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1055
  def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
1056
  """
1057
  Generate AI-powered corrected RDF/XML based on validation errors.
@@ -1318,6 +1497,84 @@ Output ONLY valid RDF/XML following these rules:
1318
 
1319
  {generate_manual_correction_hints(validation_results, rdf_content)}"""
1320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1321
  def generate_manual_suggestions(validation_results: str) -> str:
1322
  """Generate generic, pattern-based suggestions when AI is not available.
1323
 
@@ -1678,7 +1935,7 @@ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True,
1678
  # Pass filtered results to AI functions
1679
  suggestions = get_ai_suggestions(filtered_results, rdf_content, include_warnings)
1680
  steps_log.append("Requested AI suggestions for concise guidance")
1681
- corrected_rdf = get_ai_correction(
1682
  filtered_results,
1683
  rdf_content,
1684
  template,
 
71
  MCP4BIBFRAME_DOCS_URL = "https://jimfhahn-mcp4bibframe-docs.hf.space/api/mcp"
72
  MCP4BIBFRAME_DOCS_ENABLED = True # Set to False to disable doc integration
73
 
74
+ # Cache of BibFrame documentation responses to avoid repeated network calls. Keys are "<tool_name>:<params serialized as JSON>"; values are (payload, time.time() timestamp) tuples.
75
+ BIBFRAME_DOCS_CACHE: Dict[str, tuple[Any, float]] = {}
76
+ BIBFRAME_DOCS_CACHE_TTL = 3600 # seconds a cached response is served before it is fetched again
77
+
78
  def test_validator_functionality():
79
  """Test if the validator is actually working"""
80
  if not VALIDATOR_AVAILABLE:
 
169
 
170
  return None
171
 
172
+
173
def query_bibframe_docs_cached(tool_name: str, params: dict, timeout: int = 10) -> Optional[dict]:
    """Return a BibFrame docs response, reusing a recent cached copy when possible.

    Args:
        tool_name: Name of the documentation tool to query.
        params: Parameters forwarded to the tool; also used to build the cache key.
        timeout: Timeout forwarded to ``query_bibframe_docs`` (not part of the key).

    Returns:
        The cached or freshly fetched payload, or ``None`` when the docs
        integration is disabled or the underlying query returned ``None``.
    """
    if not MCP4BIBFRAME_DOCS_ENABLED:
        return None

    # Prefer a canonical JSON form so key order in ``params`` does not matter;
    # fall back to ``str`` for values json cannot serialize.
    try:
        cache_key = f"{tool_name}:{json.dumps(params, sort_keys=True)}"
    except TypeError:
        cache_key = f"{tool_name}:{str(params)}"

    entry = BIBFRAME_DOCS_CACHE.get(cache_key)
    if entry:
        payload, stored_at = entry
        age = time.time() - stored_at
        if age < BIBFRAME_DOCS_CACHE_TTL:
            logger.debug(f"Using cached BibFrame docs response for {cache_key}")
            return payload

    fresh = query_bibframe_docs(tool_name, params, timeout)
    if fresh is None:
        # Failed lookups are not cached, so the next call retries.
        return None

    BIBFRAME_DOCS_CACHE[cache_key] = (fresh, time.time())
    return fresh
195
+
196
  def extract_bibframe_terms_from_errors(validation_results: str) -> dict:
197
  """
198
  Extract BibFrame properties and classes mentioned in validation errors.
 
274
 
275
  # Query information for key properties
276
  for prop in terms['properties'][:3]: # Limit queries
277
+ prop_uri = _resolve_bibframe_uri(prop)
278
+ result = query_bibframe_docs_cached("get_property_info", {"property_uri": prop_uri})
279
  if result and isinstance(result, dict):
280
  guidance_parts.append(f"\n**{result.get('label', prop)}** ({prop}):")
281
  if 'definition' in result:
 
289
 
290
  # Query information for key classes
291
  for cls in terms['classes'][:2]: # Limit queries
292
+ cls_uri = _resolve_bibframe_uri(cls)
293
+ result = query_bibframe_docs_cached("get_class_info", {"class_uri": cls_uri})
294
  if result and isinstance(result, dict):
295
  guidance_parts.append(f"\n**{result.get('label', cls)}** class:")
296
  if 'definition' in result:
 
301
 
302
  # If we found AdminMetadata issues, get specific usage guidance
303
  if any(term in validation_results.lower() for term in ['adminmetadata', 'assigner', '->bf:assigner']):
304
+ result = query_bibframe_docs_cached("get_property_usage", {
305
  "property_name": "assigner",
306
  "class_name": "AdminMetadata"
307
  })
 
1082
 
1083
  return rdf_xml
1084
 
1085
+
1086
def extract_error_focus_points(validation_results: str) -> Dict[str, List[str]]:
    """Identify the focus nodes and properties mentioned in validation errors.

    Three line shapes are recognised in the report text:

    * ``Focus Node: <uri>`` (angle brackets optional)
    * ``Result Path: <path>`` where the path is a prefixed name (``bf:title``),
      a bare local name, or a full URI with or without angle brackets
    * ``Less than N values on ...->bf:<property>``

    Args:
        validation_results: Validation report text; may be empty or ``None``.

    Returns:
        A dict with sorted, de-duplicated lists under ``"properties"``,
        ``"focus_nodes"`` and ``"missing_properties"``. ``"classes"`` is
        always an empty list because no pattern here extracts class names.
    """
    import re

    focus: Dict[str, List[str]] = {
        "properties": [],
        "focus_nodes": [],
        "missing_properties": [],
        "classes": [],
    }

    if not validation_results:
        return focus

    node_set = {
        match.group(1)
        for match in re.finditer(r"Focus Node:\s*(?:<)?([^\s>]+)(?:>)?", validation_results)
    }

    property_set = set()
    for match in re.finditer(r"Result Path:\s*<?([^\s<>]+)>?", validation_results):
        # Reduce "bf:title" or ".../bibframe/title" to the local name "title";
        # capturing the first alphabetic run would yield "bf" or "ontologies".
        local_name = re.split(r"[/#:]", match.group(1))[-1]
        leading = re.match(r"[A-Za-z]+", local_name)
        if leading:
            property_set.add(leading.group(0))

    missing_set = {
        match.group(1)
        for match in re.finditer(r"Less than \d+ values on .*->bf:([A-Za-z]+)", validation_results)
    }

    focus["properties"] = sorted(property_set)
    focus["focus_nodes"] = sorted(node_set)
    focus["missing_properties"] = sorted(missing_set)
    return focus
1117
+
1118
+
1119
def _resolve_bibframe_uri(name: str) -> str:
    """Expand a BibFrame term name into a full URI.

    Args:
        name: A bare local name (``title``), a prefixed name (``bf:title``),
            or a full ``http(s)`` URI, optionally wrapped in ``<...>``.

    Returns:
        * falsy or whitespace-only input: returned unchanged;
        * full URIs: returned without surrounding angle brackets;
        * bare names: resolved against the BIBFRAME namespace;
        * names with a known prefix (``bf``, ``bflc``, ``rdf``, ``rdfs``):
          resolved against that prefix's namespace;
        * names with an unknown prefix: returned unchanged, since gluing them
          onto the BIBFRAME namespace would fabricate an invalid URI.
    """
    namespaces = {
        "bf": "http://id.loc.gov/ontologies/bibframe/",
        "bflc": "http://id.loc.gov/ontologies/bflc/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    }

    if not name:
        return name

    candidate = name.strip()
    if candidate.startswith("<") and candidate.endswith(">"):
        candidate = candidate[1:-1]
    if not candidate:
        return name

    if candidate.startswith(("http://", "https://")):
        return candidate

    prefix, separator, local = candidate.partition(":")
    if not separator:
        return namespaces["bf"] + candidate

    namespace = namespaces.get(prefix)
    if namespace is None:
        return name
    return namespace + local
1129
+
1130
+
1131
def get_targeted_bibframe_guidance(properties: List[str], classes: List[str]) -> Dict[str, dict]:
    """Look up BibFrame documentation for the given properties and classes only.

    At most the first five properties and the first five classes are queried,
    each with a 5-second timeout, through the cached docs wrapper.

    Args:
        properties: Property names to look up via ``get_property_info``.
        classes: Class names to look up via ``get_class_info``.

    Returns:
        Mapping from each queried name (as passed in) to its truthy docs
        result. Empty when the docs integration is disabled.
    """
    guidance: Dict[str, dict] = {}

    if not MCP4BIBFRAME_DOCS_ENABLED:
        return guidance

    # (names, docs tool, parameter key) - properties are queried before classes.
    lookups = (
        (properties, "get_property_info", "property_uri"),
        (classes, "get_class_info", "class_uri"),
    )
    for names, tool, uri_key in lookups:
        for term in names[:5]:
            term_uri = _resolve_bibframe_uri(term)
            info = query_bibframe_docs_cached(tool, {uri_key: term_uri}, timeout=5)
            if info:
                guidance[term] = info

    return guidance
1151
+
1152
+
1153
# Ready-made RDF/XML snippets keyed by lower-cased BibFrame property local name.
_PROPERTY_FIX_SNIPPETS = {
    "title": """<bf:title>
  <bf:Title>
    <bf:mainTitle>PLACEHOLDER_TITLE</bf:mainTitle>
  </bf:Title>
</bf:title>""",
    "language": """<bf:language>
  <bf:Language rdf:about="http://id.loc.gov/vocabulary/languages/eng">
    <rdfs:label xml:lang="en">English</rdfs:label>
    <bf:code rdf:datatype="http://www.w3.org/2001/XMLSchema#string">eng</bf:code>
  </bf:Language>
</bf:language>""",
    "content": """<bf:content>
  <bf:Content rdf:about="http://id.loc.gov/vocabulary/contentTypes/txt">
    <rdfs:label>text</rdfs:label>
    <bf:code>txt</bf:code>
  </bf:Content>
</bf:content>""",
    "contribution": """<bf:contribution>
  <bf:PrimaryContribution>
    <bf:agent>
      <bf:Agent>
        <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Person"/>
        <rdfs:label>Author Name</rdfs:label>
      </bf:Agent>
    </bf:agent>
    <bf:role>
      <bf:Role rdf:about="http://id.loc.gov/vocabulary/relators/aut">
        <rdfs:label>author</rdfs:label>
        <bf:code>aut</bf:code>
      </bf:Role>
    </bf:role>
  </bf:PrimaryContribution>
</bf:contribution>""",
    "classification": """<bf:classification>
  <bf:ClassificationLcc>
    <bf:classificationPortion>TT820</bf:classificationPortion>
    <bf:itemPortion>.B877 2002</bf:itemPortion>
    <bf:assigner>
      <bf:Organization rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
        <rdfs:label>United States, Library of Congress</rdfs:label>
      </bf:Organization>
    </bf:assigner>
  </bf:ClassificationLcc>
</bf:classification>""",
    "adminmetadata": """<bf:adminMetadata>
  <bf:AdminMetadata>
    <bf:status>
      <bf:Status rdf:about="http://id.loc.gov/vocabulary/mstatus/n">
        <rdfs:label>new</rdfs:label>
        <bf:code>n</bf:code>
      </bf:Status>
    </bf:status>
    <bf:date rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2024-01-01</bf:date>
    <bf:agent>
      <bf:Agent rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
        <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Organization"/>
        <rdfs:label>United States, Library of Congress</rdfs:label>
      </bf:Agent>
    </bf:agent>
  </bf:AdminMetadata>
</bf:adminMetadata>""",
}


def generate_property_specific_fix(property_name: str, guidance: Optional[dict] = None) -> str:
    """Generate a BibFrame-compliant RDF/XML snippet for a missing property.

    The snippets use the ``bf:``, ``rdf:`` and ``rdfs:`` prefixes and are only
    well-formed inside a document that declares them.

    Args:
        property_name: Property local name, matched case-insensitively against
            the built-in snippets. Surrounding whitespace and a leading
            ``bf:`` prefix are tolerated.
        guidance: Optional documentation for the property. Accepted for
            interface compatibility; the snippet content does not depend on it.

    Returns:
        The matching built-in snippet; otherwise a single literal placeholder
        element named after the property; or ``""`` when no usable property
        name was given (rather than emitting a malformed element).
    """
    local_name = (property_name or "").strip()
    # A prefixed name would otherwise render as "<bf:bf:title>".
    if local_name.startswith("bf:"):
        local_name = local_name[len("bf:"):]
    if not local_name:
        return ""

    snippet = _PROPERTY_FIX_SNIPPETS.get(local_name.lower())
    if snippet is not None:
        return snippet

    # Fallback: simple literal placeholder
    return f"<bf:{local_name}>PLACEHOLDER_VALUE</bf:{local_name}>"
1233
+
1234
  def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
1235
  """
1236
  Generate AI-powered corrected RDF/XML based on validation errors.
 
1497
 
1498
  {generate_manual_correction_hints(validation_results, rdf_content)}"""
1499
 
1500
+
1501
def get_ai_correction_targeted(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
    """Fast path that attempts structured quick fixes before invoking the full AI loop.

    When the validation report names between one and five missing ``bf:``
    properties, snippets for those not already present are appended inside the
    first ``bf:Work`` element (or, if there is none, the first ``bf:Instance``).
    If the validator is available and the patched document conforms, it is
    returned directly. In every other case the work is delegated to
    ``get_ai_correction`` with the possibly patched RDF.

    Args:
        validation_results: Validation report text for ``rdf_content``.
        rdf_content: RDF/XML document to correct.
        template: Validation template name, passed to the validator and the AI loop.
        max_attempts: Forwarded to ``get_ai_correction``.
        include_warnings: Forwarded to ``get_ai_correction``.
        enable_validation_loop: Forwarded to ``get_ai_correction``.
        steps_log: Optional list that receives human-readable progress messages.

    Returns:
        The quick-fixed RDF/XML when it validates, otherwise whatever
        ``get_ai_correction`` returns.
    """
    import re

    focus_points = extract_error_focus_points(validation_results)
    missing_props = focus_points.get("missing_properties", [])

    if steps_log is not None:
        steps_log.append(f"Targeted fix: detected {len(missing_props)} missing properties")
        if missing_props:
            preview = ", ".join(missing_props[:5])
            if len(missing_props) > 5:
                preview += ", ..."
            steps_log.append(f"Missing list: {preview}")

    working_rdf = rdf_content
    quick_fix_attempted = False

    if missing_props and len(missing_props) <= 5:
        guidance = get_targeted_bibframe_guidance(missing_props, focus_points.get("classes", []))
        if steps_log is not None:
            steps_log.append(f"Retrieved guidance entries: {len(guidance)}")

        def _inject_snippets(match: re.Match) -> str:
            nonlocal quick_fix_attempted
            opening, inner, closing = match.groups()
            # Require the element name to end right after the property name so
            # that e.g. "content" is not considered present because of
            # "<bf:contentAccessibility>".
            new_bits = [
                generate_property_specific_fix(prop, guidance.get(prop))
                for prop in missing_props
                if not re.search(rf"<bf:{re.escape(prop)}(?=[\s/>])", inner)
            ]
            if not new_bits:
                return match.group(0)
            quick_fix_attempted = True
            if steps_log is not None:
                element_name = opening.split()[0].lstrip("<").rstrip(">")
                steps_log.append(f"Injected {len(new_bits)} snippets into {element_name}")
            combined = opening + inner
            if not inner.endswith("\n"):
                combined += "\n"
            combined += " " + "\n ".join(new_bits) + "\n" + closing
            return combined

        # The opening tag must be exactly bf:Work / bf:Instance (not a longer
        # name such as bf:WorkTitle) and must not be self-closing, otherwise
        # the lazy body match would run on into an unrelated closing tag.
        work_pattern = re.compile(r"(<bf:Work(?=[\s>])[^>]*(?<!/)>)([\s\S]*?)(</bf:Work>)")
        instance_pattern = re.compile(r"(<bf:Instance(?=[\s>])[^>]*(?<!/)>)([\s\S]*?)(</bf:Instance>)")

        if work_pattern.search(working_rdf):
            working_rdf = work_pattern.sub(_inject_snippets, working_rdf, count=1)
        elif instance_pattern.search(working_rdf):
            working_rdf = instance_pattern.sub(_inject_snippets, working_rdf, count=1)

        if quick_fix_attempted and VALIDATOR_AVAILABLE:
            try:
                conforms, new_results = validate_rdf(working_rdf.encode('utf-8'), template)
                if conforms:
                    if steps_log is not None:
                        steps_log.append("Quick fix succeeded; validation now passes")
                    return working_rdf
                else:
                    if steps_log is not None:
                        steps_log.append("Quick fix incomplete; falling back to AI loop")
                    # Hand the AI loop the report for the patched document when there is one.
                    validation_results = new_results or validation_results
            except Exception as quick_err:
                # Best effort: a validator failure must not block the AI fallback.
                if steps_log is not None:
                    steps_log.append(f"Quick fix validation error: {quick_err}; using AI fallback")

    return get_ai_correction(
        validation_results,
        working_rdf,
        template,
        max_attempts=max_attempts,
        include_warnings=include_warnings,
        enable_validation_loop=enable_validation_loop,
        steps_log=steps_log,
    )
1576
+
1577
+
1578
  def generate_manual_suggestions(validation_results: str) -> str:
1579
  """Generate generic, pattern-based suggestions when AI is not available.
1580
 
 
1935
  # Pass filtered results to AI functions
1936
  suggestions = get_ai_suggestions(filtered_results, rdf_content, include_warnings)
1937
  steps_log.append("Requested AI suggestions for concise guidance")
1938
+ corrected_rdf = get_ai_correction_targeted(
1939
  filtered_results,
1940
  rdf_content,
1941
  template,