RDF Validation Deployment
committed on
Commit
·
48a9d7d
1
Parent(s):
df23939
rdf fix
Browse files
app.py
CHANGED
|
@@ -306,42 +306,83 @@ def get_openai_client():
|
|
| 306 |
timeout=120.0 # Increase timeout for cold starts
|
| 307 |
)
|
| 308 |
|
| 309 |
-
# Sample RDF data for examples
|
| 310 |
SAMPLE_VALID_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
|
| 311 |
-
<rdf:RDF xmlns:
|
| 312 |
-
xmlns:
|
| 313 |
-
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
|
|
|
|
|
|
|
| 314 |
|
| 315 |
<bf:Work rdf:about="http://example.org/work/1">
|
| 316 |
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Text"/>
|
|
|
|
|
|
|
| 317 |
<bf:title>
|
| 318 |
<bf:Title>
|
| 319 |
-
<bf:mainTitle>
|
|
|
|
| 320 |
</bf:Title>
|
| 321 |
</bf:title>
|
|
|
|
| 322 |
<bf:contribution>
|
| 323 |
-
<bf:
|
| 324 |
<bf:agent>
|
| 325 |
-
<bf:Agent>
|
| 326 |
-
<
|
|
|
|
| 327 |
</bf:Agent>
|
| 328 |
</bf:agent>
|
| 329 |
<bf:role>
|
| 330 |
-
<bf:Role>
|
| 331 |
-
<rdfs:label>
|
|
|
|
| 332 |
</bf:Role>
|
| 333 |
</bf:role>
|
| 334 |
-
</bf:
|
| 335 |
</bf:contribution>
|
| 336 |
-
|
| 337 |
-
<bf:
|
| 338 |
-
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
<bf:assigner>
|
| 341 |
-
<bf:Organization>
|
| 342 |
-
<rdfs:label>
|
| 343 |
</bf:Organization>
|
| 344 |
</bf:assigner>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
</bf:AdminMetadata>
|
| 346 |
</bf:adminMetadata>
|
| 347 |
</bf:Work>
|
|
@@ -360,6 +401,127 @@ SAMPLE_INVALID_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
|
|
| 360 |
</bf:Work>
|
| 361 |
</rdf:RDF>'''
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
# MCP Server Tools (can be used independently)
|
| 364 |
def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
|
| 365 |
"""
|
|
@@ -838,6 +1000,58 @@ def extract_rdf_from_response(response: str) -> str:
|
|
| 838 |
# If no code blocks found, return the response as-is
|
| 839 |
return response
|
| 840 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
|
| 842 |
"""
|
| 843 |
Generate AI-powered corrected RDF/XML based on validation errors.
|
|
@@ -950,6 +1164,39 @@ Rules:
|
|
| 950 |
- Else if <bf:descriptionModifier rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 951 |
- Else if a <bf:identifiedBy> block contains <bf:assigner rdf:resource=\"...\"/>, copy that URI to a TOP-LEVEL <bf:assigner>.
|
| 952 |
Keep all existing content; only add missing <bf:assigner> where required.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 953 |
"""
|
| 954 |
|
| 955 |
prompt = f"""You are an expert in RDF/XML and BibFrame cataloging. Fix the following RDF/XML based on the validation errors and official BibFrame documentation.
|
|
@@ -957,6 +1204,8 @@ Keep all existing content; only add missing <bf:assigner> where required.
|
|
| 957 |
{severity_instruction}
|
| 958 |
{admin_guidance}
|
| 959 |
{guidance_section}
|
|
|
|
|
|
|
| 960 |
|
| 961 |
Validation Errors:
|
| 962 |
{validation_results}
|
|
@@ -966,21 +1215,30 @@ Original RDF/XML:
|
|
| 966 |
|
| 967 |
{f"Previous attempt {attempt} still had validation errors. Please fix ALL issues this time." if attempt > 0 else ""}
|
| 968 |
|
| 969 |
-
|
| 970 |
-
|
| 971 |
-
|
| 972 |
-
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
- Fix any syntax or structural issues"""
|
| 976 |
|
| 977 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 978 |
chat_completion = client.chat.completions.create(
|
| 979 |
model=HF_MODEL,
|
| 980 |
messages=[
|
| 981 |
{
|
| 982 |
"role": "system",
|
| 983 |
-
"content":
|
| 984 |
},
|
| 985 |
{
|
| 986 |
"role": "user",
|
|
@@ -994,11 +1252,14 @@ Please provide the corrected RDF/XML that addresses all validation issues.
|
|
| 994 |
|
| 995 |
corrected_rdf = chat_completion.choices[0].message.content.strip()
|
| 996 |
if steps_log is not None:
|
| 997 |
-
steps_log.append(f"Attempt {attempt_no}: model responded; extracting
|
| 998 |
|
| 999 |
# Extract RDF content if it's wrapped in code blocks
|
| 1000 |
corrected_rdf = extract_rdf_from_response(corrected_rdf)
|
| 1001 |
|
|
|
|
|
|
|
|
|
|
| 1002 |
# Only validate if we have the validator and haven't hit timeout
|
| 1003 |
if VALIDATOR_AVAILABLE and (time.time() - start_time < timeout - 10):
|
| 1004 |
try:
|
|
|
|
| 306 |
timeout=120.0 # Increase timeout for cold starts
|
| 307 |
)
|
| 308 |
|
| 309 |
+
# Sample RDF data for examples (based on real Library of Congress BibFrame)
|
| 310 |
SAMPLE_VALID_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
|
| 311 |
+
<rdf:RDF xmlns:bf="http://id.loc.gov/ontologies/bibframe/"
|
| 312 |
+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
| 313 |
+
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
|
| 314 |
+
xmlns:bflc="http://id.loc.gov/ontologies/bflc/"
|
| 315 |
+
xmlns:madsrdf="http://www.loc.gov/mads/rdf/v1#">
|
| 316 |
|
| 317 |
<bf:Work rdf:about="http://example.org/work/1">
|
| 318 |
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Text"/>
|
| 319 |
+
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Monograph"/>
|
| 320 |
+
|
| 321 |
<bf:title>
|
| 322 |
<bf:Title>
|
| 323 |
+
<bf:mainTitle>The knitter's handy book of patterns</bf:mainTitle>
|
| 324 |
+
<bf:subtitle>basic designs in multiple sizes & gauges</bf:subtitle>
|
| 325 |
</bf:Title>
|
| 326 |
</bf:title>
|
| 327 |
+
|
| 328 |
<bf:contribution>
|
| 329 |
+
<bf:PrimaryContribution>
|
| 330 |
<bf:agent>
|
| 331 |
+
<bf:Agent rdf:about="http://id.loc.gov/rwo/agents/n2001017606">
|
| 332 |
+
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Person"/>
|
| 333 |
+
<rdfs:label>Budd, Ann, 1956-</rdfs:label>
|
| 334 |
</bf:Agent>
|
| 335 |
</bf:agent>
|
| 336 |
<bf:role>
|
| 337 |
+
<bf:Role rdf:about="http://id.loc.gov/vocabulary/relators/aut">
|
| 338 |
+
<rdfs:label>author</rdfs:label>
|
| 339 |
+
<bf:code>aut</bf:code>
|
| 340 |
</bf:Role>
|
| 341 |
</bf:role>
|
| 342 |
+
</bf:PrimaryContribution>
|
| 343 |
</bf:contribution>
|
| 344 |
+
|
| 345 |
+
<bf:language>
|
| 346 |
+
<bf:Language rdf:about="http://id.loc.gov/vocabulary/languages/eng">
|
| 347 |
+
<rdfs:label xml:lang="en">English</rdfs:label>
|
| 348 |
+
<bf:code rdf:datatype="http://www.w3.org/2001/XMLSchema#string">eng</bf:code>
|
| 349 |
+
</bf:Language>
|
| 350 |
+
</bf:language>
|
| 351 |
+
|
| 352 |
+
<bf:content>
|
| 353 |
+
<bf:Content rdf:about="http://id.loc.gov/vocabulary/contentTypes/txt">
|
| 354 |
+
<rdfs:label>text</rdfs:label>
|
| 355 |
+
<bf:code>txt</bf:code>
|
| 356 |
+
</bf:Content>
|
| 357 |
+
</bf:content>
|
| 358 |
+
|
| 359 |
+
<bf:classification>
|
| 360 |
+
<bf:ClassificationLcc>
|
| 361 |
+
<bf:classificationPortion>TT820</bf:classificationPortion>
|
| 362 |
+
<bf:itemPortion>.B877 2002</bf:itemPortion>
|
| 363 |
<bf:assigner>
|
| 364 |
+
<bf:Organization rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
|
| 365 |
+
<rdfs:label>United States, Library of Congress</rdfs:label>
|
| 366 |
</bf:Organization>
|
| 367 |
</bf:assigner>
|
| 368 |
+
</bf:ClassificationLcc>
|
| 369 |
+
</bf:classification>
|
| 370 |
+
|
| 371 |
+
<bf:adminMetadata>
|
| 372 |
+
<bf:AdminMetadata>
|
| 373 |
+
<bf:status>
|
| 374 |
+
<bf:Status rdf:about="http://id.loc.gov/vocabulary/mstatus/n">
|
| 375 |
+
<rdfs:label>new</rdfs:label>
|
| 376 |
+
<bf:code>n</bf:code>
|
| 377 |
+
</bf:Status>
|
| 378 |
+
</bf:status>
|
| 379 |
+
<bf:date rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2001-12-12</bf:date>
|
| 380 |
+
<bf:agent>
|
| 381 |
+
<bf:Agent rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
|
| 382 |
+
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Organization"/>
|
| 383 |
+
<rdfs:label>United States, Library of Congress</rdfs:label>
|
| 384 |
+
</bf:Agent>
|
| 385 |
+
</bf:agent>
|
| 386 |
</bf:AdminMetadata>
|
| 387 |
</bf:adminMetadata>
|
| 388 |
</bf:Work>
|
|
|
|
| 401 |
</bf:Work>
|
| 402 |
</rdf:RDF>'''
|
| 403 |
|
| 404 |
+
# BibFrame few-shot examples (based on real Library of Congress records).
#
# Each entry maps an example name to a dict with three keys:
#   "pattern" - regular expression searched (case-insensitively) in the
#               validator output to decide whether the example is relevant.
#   "wrong"   - a typical malformed snippet, shown to the model as the
#               anti-pattern.
#   "correct" - the snippet the model should imitate.
#
# Every "wrong"/"correct" value must be a well-formed XML fragment, because
# the model is told to copy the "correct" form exactly. In particular, a
# literal ampersand in text content has to be written as "&amp;".
BIBFRAME_CORRECTION_EXAMPLES = {
    "title_structure": {
        "pattern": r"bf:title",
        "wrong": """<bf:title>Simple Title String</bf:title>""",
        "correct": """<bf:title>
  <bf:Title>
    <bf:mainTitle>The knitter's handy book of patterns</bf:mainTitle>
    <bf:subtitle>basic designs in multiple sizes &amp; gauges</bf:subtitle>
  </bf:Title>
</bf:title>""",
    },
    "adminmetadata": {
        "pattern": r"bf:adminMetadata|->bf:assigner",
        "wrong": """<bf:adminMetadata>
  <bf:AdminMetadata>
    <bf:agent rdf:resource="http://example.org/org"/>
    <bf:status>new</bf:status>
  </bf:AdminMetadata>
</bf:adminMetadata>""",
        "correct": """<bf:adminMetadata>
  <bf:AdminMetadata>
    <bf:status>
      <bf:Status rdf:about="http://id.loc.gov/vocabulary/mstatus/n">
        <rdfs:label>new</rdfs:label>
        <bf:code>n</bf:code>
      </bf:Status>
    </bf:status>
    <bf:date rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2001-12-12</bf:date>
    <bf:agent>
      <bf:Agent rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
        <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Organization"/>
        <rdfs:label>United States, Library of Congress</rdfs:label>
      </bf:Agent>
    </bf:agent>
  </bf:AdminMetadata>
</bf:adminMetadata>""",
    },
    "contribution": {
        "pattern": r"bf:contribution",
        "wrong": """<bf:contribution>Author Name</bf:contribution>""",
        "correct": """<bf:contribution>
  <bf:PrimaryContribution>
    <bf:agent>
      <bf:Agent rdf:about="http://id.loc.gov/rwo/agents/n2001017606">
        <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Person"/>
        <rdfs:label>Budd, Ann, 1956-</rdfs:label>
      </bf:Agent>
    </bf:agent>
    <bf:role>
      <bf:Role rdf:about="http://id.loc.gov/vocabulary/relators/ctb">
        <rdfs:label>contributor</rdfs:label>
        <bf:code>ctb</bf:code>
      </bf:Role>
    </bf:role>
  </bf:PrimaryContribution>
</bf:contribution>""",
    },
    "language": {
        "pattern": r"bf:language",
        "wrong": """<bf:language>English</bf:language>""",
        "correct": """<bf:language>
  <bf:Language rdf:about="http://id.loc.gov/vocabulary/languages/eng">
    <rdfs:label xml:lang="en">English</rdfs:label>
    <bf:code rdf:datatype="http://www.w3.org/2001/XMLSchema#string">eng</bf:code>
  </bf:Language>
</bf:language>""",
    },
    "content": {
        "pattern": r"bf:content",
        "wrong": """<bf:content>Text</bf:content>""",
        "correct": """<bf:content>
  <bf:Content rdf:about="http://id.loc.gov/vocabulary/contentTypes/txt">
    <rdfs:label>text</rdfs:label>
    <bf:code>txt</bf:code>
  </bf:Content>
</bf:content>""",
    },
    "classification": {
        "pattern": r"bf:classification",
        "wrong": """<bf:classification>TT820 .B877 2002</bf:classification>""",
        "correct": """<bf:classification>
  <bf:ClassificationLcc>
    <bf:classificationPortion>TT820</bf:classificationPortion>
    <bf:itemPortion>.B877 2002</bf:itemPortion>
    <bf:assigner>
      <bf:Organization rdf:about="http://id.loc.gov/vocabulary/organizations/dlc">
        <rdfs:label>United States, Library of Congress</rdfs:label>
        <bf:code rdf:datatype="http://id.loc.gov/datatypes/orgs/code">DLC</bf:code>
      </bf:Organization>
    </bf:assigner>
    <bf:status>
      <bf:Status rdf:about="http://id.loc.gov/vocabulary/mstatus/uba">
        <rdfs:label>used by assigner</rdfs:label>
        <bf:code>uba</bf:code>
      </bf:Status>
    </bf:status>
  </bf:ClassificationLcc>
</bf:classification>""",
    },
    "subject": {
        "pattern": r"bf:subject",
        "wrong": """<bf:subject>Knitting--Patterns</bf:subject>""",
        "correct": """<bf:subject>
  <bf:Topic rdf:about="http://id.loc.gov/authorities/subjects/sh85072708">
    <rdfs:label xml:lang="en">Knitting--Patterns</rdfs:label>
    <madsrdf:componentList rdf:parseType="Collection">
      <madsrdf:Authority>
        <madsrdf:authoritativeLabel xml:lang="en">Knitting</madsrdf:authoritativeLabel>
        <madsrdf:elementList>
          <madsrdf:TopicElement>
            <madsrdf:elementValue xml:lang="en">Knitting</madsrdf:elementValue>
          </madsrdf:TopicElement>
        </madsrdf:elementList>
      </madsrdf:Authority>
    </madsrdf:componentList>
  </bf:Topic>
</bf:subject>""",
    },
}
|
| 524 |
+
|
| 525 |
# MCP Server Tools (can be used independently)
|
| 526 |
def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
|
| 527 |
"""
|
|
|
|
| 1000 |
# If no code blocks found, return the response as-is
|
| 1001 |
return response
|
| 1002 |
|
| 1003 |
+
def fix_common_rdf_errors(rdf_xml: str) -> str:
    """
    Fix common RDF/XML errors that AI models generate.

    The fixes are purely textual (regular-expression) rewrites, applied in
    this order:

    1. Remove ``rdf:parseType`` attributes, except
       ``rdf:parseType="Collection"``. The correction prompt built in this
       file explicitly allows that value on ``madsrdf:componentList``, so
       stripping it would undo output the model was asked to produce.
    2. Wrap a plain-text ``<bf:title>`` in the nested
       ``bf:Title`` / ``bf:mainTitle`` structure. Surrounding whitespace is
       trimmed; empty or whitespace-only titles are left untouched.
    3. Replace known literal ``<bf:language>`` values with an
       ``rdf:resource`` reference to the matching language URI.
    4. Replace known literal ``<bf:content>`` values with an
       ``rdf:resource`` reference to the matching content-type URI.

    Args:
        rdf_xml (str): RDF/XML that may contain common errors

    Returns:
        str: Fixed RDF/XML. Empty or ``None`` input is returned unchanged.
    """
    import re

    # Nothing to rewrite; also avoids a TypeError from re.sub on None.
    if not rdf_xml:
        return rdf_xml

    # Drop rdf:parseType (either quote style) unless its value is exactly
    # "Collection" - the negative lookahead keeps that one in place.
    rdf_xml = re.sub(
        r'''\s+rdf:parseType=(["'])(?!Collection\1)[^"']*\1''',
        '',
        rdf_xml,
    )

    # A bf:title holding only text (no child element) becomes a nested
    # bf:Title/bf:mainTitle. The first captured character must be
    # non-whitespace so that blank titles are not wrapped.
    rdf_xml = re.sub(
        r'<bf:title>\s*([^<\s][^<]*?)\s*</bf:title>',
        r'<bf:title><bf:Title><bf:mainTitle>\1</bf:mainTitle></bf:Title></bf:title>',
        rdf_xml,
    )

    # Literal values that should be URI references, grouped by property.
    # Matching is case-insensitive, so one spelling per value is enough.
    literal_to_uri = {
        'bf:language': {
            'English': 'http://id.loc.gov/vocabulary/languages/eng',
            'eng': 'http://id.loc.gov/vocabulary/languages/eng',
            'Spanish': 'http://id.loc.gov/vocabulary/languages/spa',
            'French': 'http://id.loc.gov/vocabulary/languages/fre',
        },
        'bf:content': {
            'Text': 'http://id.loc.gov/vocabulary/contentTypes/txt',
        },
    }
    for tag, mapping in literal_to_uri.items():
        for literal, uri in mapping.items():
            rdf_xml = re.sub(
                # re.escape keeps the lookup text from being read as regex
                # syntax; \s* tolerates padding/newlines around the literal.
                rf'<{tag}>\s*{re.escape(literal)}\s*</{tag}>',
                f'<{tag} rdf:resource="{uri}"/>',
                rdf_xml,
                flags=re.IGNORECASE,
            )

    return rdf_xml
|
| 1054 |
+
|
| 1055 |
def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
|
| 1056 |
"""
|
| 1057 |
Generate AI-powered corrected RDF/XML based on validation errors.
|
|
|
|
| 1164 |
- Else if <bf:descriptionModifier rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 1165 |
- Else if a <bf:identifiedBy> block contains <bf:assigner rdf:resource=\"...\"/>, copy that URI to a TOP-LEVEL <bf:assigner>.
|
| 1166 |
Keep all existing content; only add missing <bf:assigner> where required.
|
| 1167 |
+
"""
|
| 1168 |
+
|
| 1169 |
+
# Build few-shot examples based on the errors found
|
| 1170 |
+
examples_to_include = []
|
| 1171 |
+
validation_lower = validation_results.lower()
|
| 1172 |
+
|
| 1173 |
+
# Check each example pattern against validation results
|
| 1174 |
+
for name, example in BIBFRAME_CORRECTION_EXAMPLES.items():
|
| 1175 |
+
pattern = example.get("pattern", name)
|
| 1176 |
+
if re.search(pattern, validation_results, re.IGNORECASE):
|
| 1177 |
+
examples_to_include.append((name, example))
|
| 1178 |
+
if steps_log is not None:
|
| 1179 |
+
steps_log.append(f"Including {name} example based on pattern match")
|
| 1180 |
+
|
| 1181 |
+
few_shot_section = ""
|
| 1182 |
+
if examples_to_include:
|
| 1183 |
+
few_shot_section = "\n\nCORRECT BIBFRAME PATTERNS (from Library of Congress records):\n"
|
| 1184 |
+
few_shot_section += "NEVER use simple strings - always use nested structures as shown below:\n\n"
|
| 1185 |
+
for name, example in examples_to_include:
|
| 1186 |
+
few_shot_section += f"{name.upper()}:\n"
|
| 1187 |
+
few_shot_section += f"❌ WRONG:\n```xml\n{example['wrong']}\n```\n"
|
| 1188 |
+
few_shot_section += f"✅ CORRECT:\n```xml\n{example['correct']}\n```\n\n"
|
| 1189 |
+
|
| 1190 |
+
# Add critical rules based on real patterns
|
| 1191 |
+
critical_rules = """
|
| 1192 |
+
CRITICAL RDF/XML RULES (from real BibFrame):
|
| 1193 |
+
1. NEVER use rdf:parseType except for "Collection" on madsrdf:componentList
|
| 1194 |
+
2. Properties like bf:title, bf:language, bf:content MUST have nested typed resources
|
| 1195 |
+
3. Use rdf:about for resource URIs, not rdf:resource on the property element
|
| 1196 |
+
4. bf:adminMetadata can appear multiple times in one record
|
| 1197 |
+
5. Status, Role, Language etc. are OBJECTS with rdf:about URIs, not literals
|
| 1198 |
+
6. Date values use rdf:datatype for typing (e.g., xsd:date, xsd:dateTime)
|
| 1199 |
+
7. Every bf:AdminMetadata needs BOTH bf:agent AND bf:assigner if validation requires it
|
| 1200 |
"""
|
| 1201 |
|
| 1202 |
prompt = f"""You are an expert in RDF/XML and BibFrame cataloging. Fix the following RDF/XML based on the validation errors and official BibFrame documentation.
|
|
|
|
| 1204 |
{severity_instruction}
|
| 1205 |
{admin_guidance}
|
| 1206 |
{guidance_section}
|
| 1207 |
+
{critical_rules}
|
| 1208 |
+
{few_shot_section}
|
| 1209 |
|
| 1210 |
Validation Errors:
|
| 1211 |
{validation_results}
|
|
|
|
| 1215 |
|
| 1216 |
{f"Previous attempt {attempt} still had validation errors. Please fix ALL issues this time." if attempt > 0 else ""}
|
| 1217 |
|
| 1218 |
+
INSTRUCTIONS:
|
| 1219 |
+
1. Return ONLY valid RDF/XML - no explanations
|
| 1220 |
+
2. Follow the EXACT patterns shown in the examples above
|
| 1221 |
+
3. Use proper nested structures - NO simple string values for complex properties
|
| 1222 |
+
4. Keep ALL namespace declarations
|
| 1223 |
+
5. Fix ALL validation errors"""
|
|
|
|
| 1224 |
|
| 1225 |
try:
|
| 1226 |
+
# Update system prompt to be even more explicit
|
| 1227 |
+
system_prompt = """You are an RDF/XML expert following Library of Congress BibFrame patterns.
|
| 1228 |
+
Output ONLY valid RDF/XML following these rules:
|
| 1229 |
+
- Start with <?xml version="1.0" encoding="UTF-8"?>
|
| 1230 |
+
- NO markdown, NO explanations
|
| 1231 |
+
- Use EXACT structure patterns from the examples
|
| 1232 |
+
- Complex properties need nested typed resources
|
| 1233 |
+
- rdf:parseType ONLY for Collection on madsrdf:componentList
|
| 1234 |
+
- Status/Role/Language are OBJECTS with URIs, not strings"""
|
| 1235 |
+
|
| 1236 |
chat_completion = client.chat.completions.create(
|
| 1237 |
model=HF_MODEL,
|
| 1238 |
messages=[
|
| 1239 |
{
|
| 1240 |
"role": "system",
|
| 1241 |
+
"content": system_prompt
|
| 1242 |
},
|
| 1243 |
{
|
| 1244 |
"role": "user",
|
|
|
|
| 1252 |
|
| 1253 |
corrected_rdf = chat_completion.choices[0].message.content.strip()
|
| 1254 |
if steps_log is not None:
|
| 1255 |
+
steps_log.append(f"Attempt {attempt_no}: model responded; extracting and fixing common errors")
|
| 1256 |
|
| 1257 |
# Extract RDF content if it's wrapped in code blocks
|
| 1258 |
corrected_rdf = extract_rdf_from_response(corrected_rdf)
|
| 1259 |
|
| 1260 |
+
# Fix common AI mistakes
|
| 1261 |
+
corrected_rdf = fix_common_rdf_errors(corrected_rdf)
|
| 1262 |
+
|
| 1263 |
# Only validate if we have the validator and haven't hit timeout
|
| 1264 |
if VALIDATOR_AVAILABLE and (time.time() - start_time < timeout - 10):
|
| 1265 |
try:
|