RDF Validation Deployment
committed on
Commit
·
f037349
1
Parent(s):
bdd95fd
LLM: unify get_ai_correction; XML-only system prompt; deterministic temperature; AdminMetadata assigner guidance; lower temp for suggestions
Browse files
app.py
CHANGED
|
@@ -375,151 +375,7 @@ def extract_relevant_rdf_section(rdf_content: str, class_name: str) -> str:
|
|
| 375 |
|
| 376 |
return rdf_content[:1000] # Fallback
|
| 377 |
|
| 378 |
-
|
| 379 |
-
"""
|
| 380 |
-
Generate AI-powered corrected RDF/XML based on validation errors.
|
| 381 |
-
|
| 382 |
-
This tool takes invalid RDF/XML and validation results, then generates
|
| 383 |
-
a corrected version that addresses all identified validation issues.
|
| 384 |
-
The generated correction is validated before being returned to the user.
|
| 385 |
-
|
| 386 |
-
Args:
|
| 387 |
-
validation_results (str): The validation error messages
|
| 388 |
-
rdf_content (str): The original invalid RDF/XML content
|
| 389 |
-
template (str): The validation template to use
|
| 390 |
-
max_attempts (int): Maximum number of attempts to generate valid RDF (uses MAX_CORRECTION_ATTEMPTS if None)
|
| 391 |
-
include_warnings (bool): Whether to fix warnings in addition to violations
|
| 392 |
-
|
| 393 |
-
Returns:
|
| 394 |
-
str: Corrected RDF/XML that should pass validation
|
| 395 |
-
"""
|
| 396 |
-
|
| 397 |
-
# Determine whether to iterate based on parameter or global default
|
| 398 |
-
iterate_enabled = ENABLE_VALIDATION_LOOP if enable_validation_loop is None else enable_validation_loop
|
| 399 |
-
if steps_log is not None:
|
| 400 |
-
steps_log.append(f"Planning correction: iterate_enabled={iterate_enabled}, include_warnings={include_warnings}")
|
| 401 |
-
# Use configuration default if not specified
|
| 402 |
-
if max_attempts is None:
|
| 403 |
-
max_attempts = MAX_CORRECTION_ATTEMPTS
|
| 404 |
-
if steps_log is not None:
|
| 405 |
-
steps_log.append(f"Max attempts set to {max_attempts}")
|
| 406 |
-
# If iteration disabled, force single attempt
|
| 407 |
-
if not iterate_enabled:
|
| 408 |
-
max_attempts = 1
|
| 409 |
-
if steps_log is not None:
|
| 410 |
-
steps_log.append("Iteration disabled; forcing single attempt")
|
| 411 |
-
|
| 412 |
-
if not OPENAI_AVAILABLE:
|
| 413 |
-
if steps_log is not None:
|
| 414 |
-
steps_log.append("OPENAI client not available; falling back to manual hints")
|
| 415 |
-
return generate_manual_correction_hints(validation_results, rdf_content)
|
| 416 |
-
|
| 417 |
-
# Get API key dynamically at runtime
|
| 418 |
-
current_api_key = os.getenv('HF_API_KEY', '')
|
| 419 |
-
if not current_api_key:
|
| 420 |
-
if steps_log is not None:
|
| 421 |
-
steps_log.append("HF_API_KEY not set; cannot call model; returning manual hints")
|
| 422 |
-
return f"""<!-- AI correction disabled: Set HF_API_KEY as a Secret in your Space settings -->
|
| 423 |
-
|
| 424 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 425 |
-
|
| 426 |
-
try:
|
| 427 |
-
client = get_openai_client()
|
| 428 |
-
if not client:
|
| 429 |
-
if steps_log is not None:
|
| 430 |
-
steps_log.append("Failed to initialize OpenAI client; returning manual hints")
|
| 431 |
-
return f"""<!-- AI correction disabled: HF_API_KEY not configured -->
|
| 432 |
-
|
| 433 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 434 |
-
|
| 435 |
-
# Add timeout protection
|
| 436 |
-
import time
|
| 437 |
-
start_time = time.time()
|
| 438 |
-
timeout = 120 # Increased to 120 second total timeout
|
| 439 |
-
if steps_log is not None:
|
| 440 |
-
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 441 |
-
|
| 442 |
-
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 443 |
-
|
| 444 |
-
# Filter validation results by class
|
| 445 |
-
class_results = filter_validation_results_by_class(validation_results, rdf_content)
|
| 446 |
-
|
| 447 |
-
# Process each class separately to avoid overwhelming the LLM
|
| 448 |
-
corrected_sections = {}
|
| 449 |
-
|
| 450 |
-
for class_name, class_errors in class_results.items():
|
| 451 |
-
if not class_errors:
|
| 452 |
-
continue
|
| 453 |
-
|
| 454 |
-
# Check timeout
|
| 455 |
-
if time.time() - start_time > timeout - 10:
|
| 456 |
-
print(f"⏰ Approaching timeout, skipping {class_name}")
|
| 457 |
-
break
|
| 458 |
-
|
| 459 |
-
print(f"🔄 Correcting {class_name} section")
|
| 460 |
-
|
| 461 |
-
# Extract relevant section
|
| 462 |
-
relevant_section = extract_relevant_rdf_section(rdf_content, class_name)
|
| 463 |
-
|
| 464 |
-
base_prompt = f"""Fix this {class_name} RDF section based on these specific errors.
|
| 465 |
-
|
| 466 |
-
{severity_instruction}
|
| 467 |
-
|
| 468 |
-
Errors for {class_name}:
|
| 469 |
-
{class_errors[:800]}
|
| 470 |
-
|
| 471 |
-
Current {class_name} RDF:
|
| 472 |
-
{relevant_section[:800]}
|
| 473 |
-
|
| 474 |
-
Return ONLY the corrected {class_name} XML section. No explanations."""
|
| 475 |
-
# Targeted guidance for AdminMetadata -> bf:assigner
|
| 476 |
-
if class_name == 'AdminMetadata' and ('bf:assigner' in class_errors or '->bf:assigner' in class_errors):
|
| 477 |
-
guidance = """
|
| 478 |
-
Every <bf:AdminMetadata> MUST have a direct <bf:assigner> child.
|
| 479 |
-
If <bf:agent rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
|
| 480 |
-
If <bf:descriptionModifier rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
|
| 481 |
-
If neither exists but there is a <bf:identifiedBy> ... <bf:assigner rdf:resource="..."/> inside, copy that value to a TOP-LEVEL <bf:assigner> under <bf:AdminMetadata>.
|
| 482 |
-
Keep all existing content; just add the missing <bf:assigner>.
|
| 483 |
-
"""
|
| 484 |
-
prompt = guidance + "\n\n" + base_prompt
|
| 485 |
-
else:
|
| 486 |
-
prompt = base_prompt
|
| 487 |
-
|
| 488 |
-
try:
|
| 489 |
-
chat_completion = client.chat.completions.create(
|
| 490 |
-
model=HF_MODEL,
|
| 491 |
-
messages=[
|
| 492 |
-
{
|
| 493 |
-
"role": "user",
|
| 494 |
-
"content": prompt
|
| 495 |
-
}
|
| 496 |
-
],
|
| 497 |
-
max_tokens=1000,
|
| 498 |
-
temperature=0.3,
|
| 499 |
-
timeout=45 # Increased per-section timeout
|
| 500 |
-
)
|
| 501 |
-
|
| 502 |
-
corrected_section = chat_completion.choices[0].message.content.strip()
|
| 503 |
-
corrected_sections[class_name] = extract_rdf_from_response(corrected_section)
|
| 504 |
-
|
| 505 |
-
except Exception as e:
|
| 506 |
-
print(f"❌ Error correcting {class_name}: {str(e)}")
|
| 507 |
-
continue
|
| 508 |
-
|
| 509 |
-
# Merge corrections back into original RDF
|
| 510 |
-
if corrected_sections:
|
| 511 |
-
corrected_rdf = merge_corrected_sections(rdf_content, corrected_sections)
|
| 512 |
-
return f"""<!-- AI-generated correction (class-based processing) -->
|
| 513 |
-
{corrected_rdf}"""
|
| 514 |
-
else:
|
| 515 |
-
return f"""<!-- AI correction failed - timeout or errors -->
|
| 516 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 517 |
-
|
| 518 |
-
except Exception as e:
|
| 519 |
-
logger.error(f"LLM API error: {str(e)}")
|
| 520 |
-
return f"""<!-- Error generating AI correction: {str(e)} -->
|
| 521 |
-
|
| 522 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 523 |
|
| 524 |
def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
|
| 525 |
"""
|
|
@@ -755,7 +611,7 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
|
|
| 755 |
timeout = 120 # Increased to 120 second total timeout
|
| 756 |
if steps_log is not None:
|
| 757 |
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 758 |
-
|
| 759 |
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 760 |
|
| 761 |
# Try multiple attempts to generate valid RDF
|
|
@@ -773,9 +629,23 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
|
|
| 773 |
steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
|
| 774 |
print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
|
| 775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
|
| 777 |
|
| 778 |
{severity_instruction}
|
|
|
|
| 779 |
|
| 780 |
Validation Errors:
|
| 781 |
{validation_results}
|
|
@@ -796,13 +666,17 @@ Please provide the corrected RDF/XML that addresses all validation issues.
|
|
| 796 |
chat_completion = client.chat.completions.create(
|
| 797 |
model=HF_MODEL,
|
| 798 |
messages=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
{
|
| 800 |
"role": "user",
|
| 801 |
"content": prompt
|
| 802 |
}
|
| 803 |
],
|
| 804 |
max_tokens=2000,
|
| 805 |
-
temperature=0.
|
| 806 |
timeout=60 # Increased to 60 second timeout per API call
|
| 807 |
)
|
| 808 |
|
|
|
|
| 375 |
|
| 376 |
return rdf_content[:1000] # Fallback
|
| 377 |
|
| 378 |
+
## [Removed duplicate get_ai_correction definition – unified below]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
|
| 381 |
"""
|
|
|
|
| 611 |
timeout = 120 # Increased to 120 second total timeout
|
| 612 |
if steps_log is not None:
|
| 613 |
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 614 |
+
|
| 615 |
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 616 |
|
| 617 |
# Try multiple attempts to generate valid RDF
|
|
|
|
| 629 |
steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
|
| 630 |
print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
|
| 631 |
|
| 632 |
+
# Targeted AdminMetadata guidance inferred from results text
|
| 633 |
+
needs_assigner = ("->bf:assigner" in validation_results) or (" bf:assigner" in validation_results)
|
| 634 |
+
admin_guidance = ""
|
| 635 |
+
if needs_assigner:
|
| 636 |
+
admin_guidance = """
|
| 637 |
+
IMPORTANT: For each <bf:AdminMetadata>, ensure it has a direct child <bf:assigner>.
|
| 638 |
+
Rules:
|
| 639 |
+
- If <bf:agent rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 640 |
+
- Else if <bf:descriptionModifier rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 641 |
+
- Else if a <bf:identifiedBy> block contains <bf:assigner rdf:resource=\"...\"/>, copy that URI to a TOP-LEVEL <bf:assigner>.
|
| 642 |
+
Keep all existing content; only add missing <bf:assigner> where required.
|
| 643 |
+
"""
|
| 644 |
+
|
| 645 |
prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
|
| 646 |
|
| 647 |
{severity_instruction}
|
| 648 |
+
{admin_guidance}
|
| 649 |
|
| 650 |
Validation Errors:
|
| 651 |
{validation_results}
|
|
|
|
| 666 |
chat_completion = client.chat.completions.create(
|
| 667 |
model=HF_MODEL,
|
| 668 |
messages=[
|
| 669 |
+
{
|
| 670 |
+
"role": "system",
|
| 671 |
+
"content": "Return only valid RDF/XML content. No prose, no markdown, no code fences, no explanations."
|
| 672 |
+
},
|
| 673 |
{
|
| 674 |
"role": "user",
|
| 675 |
"content": prompt
|
| 676 |
}
|
| 677 |
],
|
| 678 |
max_tokens=2000,
|
| 679 |
+
temperature=0.0,
|
| 680 |
timeout=60 # Increased to 60 second timeout per API call
|
| 681 |
)
|
| 682 |
|