RDF Validation Deployment committed on
Commit
f037349
·
1 Parent(s): bdd95fd

LLM: unify get_ai_correction; XML-only system prompt; deterministic temperature; AdminMetadata assigner guidance; lower temp for suggestions

Browse files
Files changed (1) hide show
  1. app.py +21 -147
app.py CHANGED
@@ -375,151 +375,7 @@ def extract_relevant_rdf_section(rdf_content: str, class_name: str) -> str:
375
 
376
  return rdf_content[:1000] # Fallback
377
 
378
- def get_ai_correction(validation_results: str, rdf_content: str, template: str = 'monograph', max_attempts: int = None, include_warnings: bool = False, enable_validation_loop: bool | None = None, steps_log: Optional[List[str]] = None) -> str:
379
- """
380
- Generate AI-powered corrected RDF/XML based on validation errors.
381
-
382
- This tool takes invalid RDF/XML and validation results, then generates
383
- a corrected version that addresses all identified validation issues.
384
- The generated correction is validated before being returned to the user.
385
-
386
- Args:
387
- validation_results (str): The validation error messages
388
- rdf_content (str): The original invalid RDF/XML content
389
- template (str): The validation template to use
390
- max_attempts (int): Maximum number of attempts to generate valid RDF (uses MAX_CORRECTION_ATTEMPTS if None)
391
- include_warnings (bool): Whether to fix warnings in addition to violations
392
-
393
- Returns:
394
- str: Corrected RDF/XML that should pass validation
395
- """
396
-
397
- # Determine whether to iterate based on parameter or global default
398
- iterate_enabled = ENABLE_VALIDATION_LOOP if enable_validation_loop is None else enable_validation_loop
399
- if steps_log is not None:
400
- steps_log.append(f"Planning correction: iterate_enabled={iterate_enabled}, include_warnings={include_warnings}")
401
- # Use configuration default if not specified
402
- if max_attempts is None:
403
- max_attempts = MAX_CORRECTION_ATTEMPTS
404
- if steps_log is not None:
405
- steps_log.append(f"Max attempts set to {max_attempts}")
406
- # If iteration disabled, force single attempt
407
- if not iterate_enabled:
408
- max_attempts = 1
409
- if steps_log is not None:
410
- steps_log.append("Iteration disabled; forcing single attempt")
411
-
412
- if not OPENAI_AVAILABLE:
413
- if steps_log is not None:
414
- steps_log.append("OPENAI client not available; falling back to manual hints")
415
- return generate_manual_correction_hints(validation_results, rdf_content)
416
-
417
- # Get API key dynamically at runtime
418
- current_api_key = os.getenv('HF_API_KEY', '')
419
- if not current_api_key:
420
- if steps_log is not None:
421
- steps_log.append("HF_API_KEY not set; cannot call model; returning manual hints")
422
- return f"""<!-- AI correction disabled: Set HF_API_KEY as a Secret in your Space settings -->
423
-
424
- {generate_manual_correction_hints(validation_results, rdf_content)}"""
425
-
426
- try:
427
- client = get_openai_client()
428
- if not client:
429
- if steps_log is not None:
430
- steps_log.append("Failed to initialize OpenAI client; returning manual hints")
431
- return f"""<!-- AI correction disabled: HF_API_KEY not configured -->
432
-
433
- {generate_manual_correction_hints(validation_results, rdf_content)}"""
434
-
435
- # Add timeout protection
436
- import time
437
- start_time = time.time()
438
- timeout = 120 # Increased to 120 second total timeout
439
- if steps_log is not None:
440
- steps_log.append(f"Timeout budget: {timeout}s total")
441
-
442
- severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
443
-
444
- # Filter validation results by class
445
- class_results = filter_validation_results_by_class(validation_results, rdf_content)
446
-
447
- # Process each class separately to avoid overwhelming the LLM
448
- corrected_sections = {}
449
-
450
- for class_name, class_errors in class_results.items():
451
- if not class_errors:
452
- continue
453
-
454
- # Check timeout
455
- if time.time() - start_time > timeout - 10:
456
- print(f"⏰ Approaching timeout, skipping {class_name}")
457
- break
458
-
459
- print(f"🔄 Correcting {class_name} section")
460
-
461
- # Extract relevant section
462
- relevant_section = extract_relevant_rdf_section(rdf_content, class_name)
463
-
464
- base_prompt = f"""Fix this {class_name} RDF section based on these specific errors.
465
-
466
- {severity_instruction}
467
-
468
- Errors for {class_name}:
469
- {class_errors[:800]}
470
-
471
- Current {class_name} RDF:
472
- {relevant_section[:800]}
473
-
474
- Return ONLY the corrected {class_name} XML section. No explanations."""
475
- # Targeted guidance for AdminMetadata -> bf:assigner
476
- if class_name == 'AdminMetadata' and ('bf:assigner' in class_errors or '->bf:assigner' in class_errors):
477
- guidance = """
478
- Every <bf:AdminMetadata> MUST have a direct <bf:assigner> child.
479
- If <bf:agent rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
480
- If <bf:descriptionModifier rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
481
- If neither exists but there is a <bf:identifiedBy> ... <bf:assigner rdf:resource="..."/> inside, copy that value to a TOP-LEVEL <bf:assigner> under <bf:AdminMetadata>.
482
- Keep all existing content; just add the missing <bf:assigner>.
483
- """
484
- prompt = guidance + "\n\n" + base_prompt
485
- else:
486
- prompt = base_prompt
487
-
488
- try:
489
- chat_completion = client.chat.completions.create(
490
- model=HF_MODEL,
491
- messages=[
492
- {
493
- "role": "user",
494
- "content": prompt
495
- }
496
- ],
497
- max_tokens=1000,
498
- temperature=0.3,
499
- timeout=45 # Increased per-section timeout
500
- )
501
-
502
- corrected_section = chat_completion.choices[0].message.content.strip()
503
- corrected_sections[class_name] = extract_rdf_from_response(corrected_section)
504
-
505
- except Exception as e:
506
- print(f"❌ Error correcting {class_name}: {str(e)}")
507
- continue
508
-
509
- # Merge corrections back into original RDF
510
- if corrected_sections:
511
- corrected_rdf = merge_corrected_sections(rdf_content, corrected_sections)
512
- return f"""<!-- AI-generated correction (class-based processing) -->
513
- {corrected_rdf}"""
514
- else:
515
- return f"""<!-- AI correction failed - timeout or errors -->
516
- {generate_manual_correction_hints(validation_results, rdf_content)}"""
517
-
518
- except Exception as e:
519
- logger.error(f"LLM API error: {str(e)}")
520
- return f"""<!-- Error generating AI correction: {str(e)} -->
521
-
522
- {generate_manual_correction_hints(validation_results, rdf_content)}"""
523
 
524
  def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
525
  """
@@ -755,7 +611,7 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
755
  timeout = 120 # Increased to 120 second total timeout
756
  if steps_log is not None:
757
  steps_log.append(f"Timeout budget: {timeout}s total")
758
-
759
  severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
760
 
761
  # Try multiple attempts to generate valid RDF
@@ -773,9 +629,23 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
773
  steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
774
  print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
777
 
778
  {severity_instruction}
 
779
 
780
  Validation Errors:
781
  {validation_results}
@@ -796,13 +666,17 @@ Please provide the corrected RDF/XML that addresses all validation issues.
796
  chat_completion = client.chat.completions.create(
797
  model=HF_MODEL,
798
  messages=[
 
 
 
 
799
  {
800
  "role": "user",
801
  "content": prompt
802
  }
803
  ],
804
  max_tokens=2000,
805
- temperature=0.3,
806
  timeout=60 # Increased to 60 second timeout per API call
807
  )
808
 
 
375
 
376
  return rdf_content[:1000] # Fallback
377
 
378
+ ## [Removed duplicate get_ai_correction definition; the unified version is below]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
  def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
381
  """
 
611
  timeout = 120 # Increased to 120 second total timeout
612
  if steps_log is not None:
613
  steps_log.append(f"Timeout budget: {timeout}s total")
614
+
615
  severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
616
 
617
  # Try multiple attempts to generate valid RDF
 
629
  steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
630
  print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
631
 
632
+ # Targeted AdminMetadata guidance inferred from results text
633
+ needs_assigner = ("->bf:assigner" in validation_results) or (" bf:assigner" in validation_results)
634
+ admin_guidance = ""
635
+ if needs_assigner:
636
+ admin_guidance = """
637
+ IMPORTANT: For each <bf:AdminMetadata>, ensure it has a direct child <bf:assigner>.
638
+ Rules:
639
+ - If <bf:agent rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
640
+ - Else if <bf:descriptionModifier rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
641
+ - Else if a <bf:identifiedBy> block contains <bf:assigner rdf:resource=\"...\"/>, copy that URI to a TOP-LEVEL <bf:assigner>.
642
+ Keep all existing content; only add missing <bf:assigner> where required.
643
+ """
644
+
645
  prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
646
 
647
  {severity_instruction}
648
+ {admin_guidance}
649
 
650
  Validation Errors:
651
  {validation_results}
 
666
  chat_completion = client.chat.completions.create(
667
  model=HF_MODEL,
668
  messages=[
669
+ {
670
+ "role": "system",
671
+ "content": "Return only valid RDF/XML content. No prose, no markdown, no code fences, no explanations."
672
+ },
673
  {
674
  "role": "user",
675
  "content": prompt
676
  }
677
  ],
678
  max_tokens=2000,
679
+ temperature=0.0,
680
  timeout=60 # Increased to 60 second timeout per API call
681
  )
682