RDF Validation Deployment
committed on
Commit
·
f037349
1
Parent(s):
bdd95fd
LLM: unify get_ai_correction; XML-only system prompt; deterministic temperature; AdminMetadata assigner guidance; lower temp for suggestions
Browse files
app.py
CHANGED
|
@@ -375,151 +375,7 @@ def extract_relevant_rdf_section(rdf_content: str, class_name: str) -> str:
|
|
| 375 |
|
| 376 |
return rdf_content[:1000] # Fallback
|
| 377 |
|
| 378 |
-
|
| 379 |
-
"""
|
| 380 |
-
Generate AI-powered corrected RDF/XML based on validation errors.
|
| 381 |
-
|
| 382 |
-
This tool takes invalid RDF/XML and validation results, then generates
|
| 383 |
-
a corrected version that addresses all identified validation issues.
|
| 384 |
-
The generated correction is validated before being returned to the user.
|
| 385 |
-
|
| 386 |
-
Args:
|
| 387 |
-
validation_results (str): The validation error messages
|
| 388 |
-
rdf_content (str): The original invalid RDF/XML content
|
| 389 |
-
template (str): The validation template to use
|
| 390 |
-
max_attempts (int): Maximum number of attempts to generate valid RDF (uses MAX_CORRECTION_ATTEMPTS if None)
|
| 391 |
-
include_warnings (bool): Whether to fix warnings in addition to violations
|
| 392 |
-
|
| 393 |
-
Returns:
|
| 394 |
-
str: Corrected RDF/XML that should pass validation
|
| 395 |
-
"""
|
| 396 |
-
|
| 397 |
-
# Determine whether to iterate based on parameter or global default
|
| 398 |
-
iterate_enabled = ENABLE_VALIDATION_LOOP if enable_validation_loop is None else enable_validation_loop
|
| 399 |
-
if steps_log is not None:
|
| 400 |
-
steps_log.append(f"Planning correction: iterate_enabled={iterate_enabled}, include_warnings={include_warnings}")
|
| 401 |
-
# Use configuration default if not specified
|
| 402 |
-
if max_attempts is None:
|
| 403 |
-
max_attempts = MAX_CORRECTION_ATTEMPTS
|
| 404 |
-
if steps_log is not None:
|
| 405 |
-
steps_log.append(f"Max attempts set to {max_attempts}")
|
| 406 |
-
# If iteration disabled, force single attempt
|
| 407 |
-
if not iterate_enabled:
|
| 408 |
-
max_attempts = 1
|
| 409 |
-
if steps_log is not None:
|
| 410 |
-
steps_log.append("Iteration disabled; forcing single attempt")
|
| 411 |
-
|
| 412 |
-
if not OPENAI_AVAILABLE:
|
| 413 |
-
if steps_log is not None:
|
| 414 |
-
steps_log.append("OPENAI client not available; falling back to manual hints")
|
| 415 |
-
return generate_manual_correction_hints(validation_results, rdf_content)
|
| 416 |
-
|
| 417 |
-
# Get API key dynamically at runtime
|
| 418 |
-
current_api_key = os.getenv('HF_API_KEY', '')
|
| 419 |
-
if not current_api_key:
|
| 420 |
-
if steps_log is not None:
|
| 421 |
-
steps_log.append("HF_API_KEY not set; cannot call model; returning manual hints")
|
| 422 |
-
return f"""<!-- AI correction disabled: Set HF_API_KEY as a Secret in your Space settings -->
|
| 423 |
-
|
| 424 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 425 |
-
|
| 426 |
-
try:
|
| 427 |
-
client = get_openai_client()
|
| 428 |
-
if not client:
|
| 429 |
-
if steps_log is not None:
|
| 430 |
-
steps_log.append("Failed to initialize OpenAI client; returning manual hints")
|
| 431 |
-
return f"""<!-- AI correction disabled: HF_API_KEY not configured -->
|
| 432 |
-
|
| 433 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 434 |
-
|
| 435 |
-
# Add timeout protection
|
| 436 |
-
import time
|
| 437 |
-
start_time = time.time()
|
| 438 |
-
timeout = 120 # Increased to 120 second total timeout
|
| 439 |
-
if steps_log is not None:
|
| 440 |
-
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 441 |
-
|
| 442 |
-
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 443 |
-
|
| 444 |
-
# Filter validation results by class
|
| 445 |
-
class_results = filter_validation_results_by_class(validation_results, rdf_content)
|
| 446 |
-
|
| 447 |
-
# Process each class separately to avoid overwhelming the LLM
|
| 448 |
-
corrected_sections = {}
|
| 449 |
-
|
| 450 |
-
for class_name, class_errors in class_results.items():
|
| 451 |
-
if not class_errors:
|
| 452 |
-
continue
|
| 453 |
-
|
| 454 |
-
# Check timeout
|
| 455 |
-
if time.time() - start_time > timeout - 10:
|
| 456 |
-
print(f"⏰ Approaching timeout, skipping {class_name}")
|
| 457 |
-
break
|
| 458 |
-
|
| 459 |
-
print(f"🔄 Correcting {class_name} section")
|
| 460 |
-
|
| 461 |
-
# Extract relevant section
|
| 462 |
-
relevant_section = extract_relevant_rdf_section(rdf_content, class_name)
|
| 463 |
-
|
| 464 |
-
base_prompt = f"""Fix this {class_name} RDF section based on these specific errors.
|
| 465 |
-
|
| 466 |
-
{severity_instruction}
|
| 467 |
-
|
| 468 |
-
Errors for {class_name}:
|
| 469 |
-
{class_errors[:800]}
|
| 470 |
-
|
| 471 |
-
Current {class_name} RDF:
|
| 472 |
-
{relevant_section[:800]}
|
| 473 |
-
|
| 474 |
-
Return ONLY the corrected {class_name} XML section. No explanations."""
|
| 475 |
-
# Targeted guidance for AdminMetadata -> bf:assigner
|
| 476 |
-
if class_name == 'AdminMetadata' and ('bf:assigner' in class_errors or '->bf:assigner' in class_errors):
|
| 477 |
-
guidance = """
|
| 478 |
-
Every <bf:AdminMetadata> MUST have a direct <bf:assigner> child.
|
| 479 |
-
If <bf:agent rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
|
| 480 |
-
If <bf:descriptionModifier rdf:resource="..."/> exists, add <bf:assigner rdf:resource="..."/> with the SAME URI.
|
| 481 |
-
If neither exists but there is a <bf:identifiedBy> ... <bf:assigner rdf:resource="..."/> inside, copy that value to a TOP-LEVEL <bf:assigner> under <bf:AdminMetadata>.
|
| 482 |
-
Keep all existing content; just add the missing <bf:assigner>.
|
| 483 |
-
"""
|
| 484 |
-
prompt = guidance + "\n\n" + base_prompt
|
| 485 |
-
else:
|
| 486 |
-
prompt = base_prompt
|
| 487 |
-
|
| 488 |
-
try:
|
| 489 |
-
chat_completion = client.chat.completions.create(
|
| 490 |
-
model=HF_MODEL,
|
| 491 |
-
messages=[
|
| 492 |
-
{
|
| 493 |
-
"role": "user",
|
| 494 |
-
"content": prompt
|
| 495 |
-
}
|
| 496 |
-
],
|
| 497 |
-
max_tokens=1000,
|
| 498 |
-
temperature=0.3,
|
| 499 |
-
timeout=45 # Increased per-section timeout
|
| 500 |
-
)
|
| 501 |
-
|
| 502 |
-
corrected_section = chat_completion.choices[0].message.content.strip()
|
| 503 |
-
corrected_sections[class_name] = extract_rdf_from_response(corrected_section)
|
| 504 |
-
|
| 505 |
-
except Exception as e:
|
| 506 |
-
print(f"❌ Error correcting {class_name}: {str(e)}")
|
| 507 |
-
continue
|
| 508 |
-
|
| 509 |
-
# Merge corrections back into original RDF
|
| 510 |
-
if corrected_sections:
|
| 511 |
-
corrected_rdf = merge_corrected_sections(rdf_content, corrected_sections)
|
| 512 |
-
return f"""<!-- AI-generated correction (class-based processing) -->
|
| 513 |
-
{corrected_rdf}"""
|
| 514 |
-
else:
|
| 515 |
-
return f"""<!-- AI correction failed - timeout or errors -->
|
| 516 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 517 |
-
|
| 518 |
-
except Exception as e:
|
| 519 |
-
logger.error(f"LLM API error: {str(e)}")
|
| 520 |
-
return f"""<!-- Error generating AI correction: {str(e)} -->
|
| 521 |
-
|
| 522 |
-
{generate_manual_correction_hints(validation_results, rdf_content)}"""
|
| 523 |
|
| 524 |
def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
|
| 525 |
"""
|
|
@@ -755,7 +611,7 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
|
|
| 755 |
timeout = 120 # Increased to 120 second total timeout
|
| 756 |
if steps_log is not None:
|
| 757 |
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 758 |
-
|
| 759 |
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 760 |
|
| 761 |
# Try multiple attempts to generate valid RDF
|
|
@@ -773,9 +629,23 @@ def get_ai_correction(validation_results: str, rdf_content: str, template: str =
|
|
| 773 |
steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
|
| 774 |
print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
|
| 775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
|
| 777 |
|
| 778 |
{severity_instruction}
|
|
|
|
| 779 |
|
| 780 |
Validation Errors:
|
| 781 |
{validation_results}
|
|
@@ -796,13 +666,17 @@ Please provide the corrected RDF/XML that addresses all validation issues.
|
|
| 796 |
chat_completion = client.chat.completions.create(
|
| 797 |
model=HF_MODEL,
|
| 798 |
messages=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
{
|
| 800 |
"role": "user",
|
| 801 |
"content": prompt
|
| 802 |
}
|
| 803 |
],
|
| 804 |
max_tokens=2000,
|
| 805 |
-
temperature=0.
|
| 806 |
timeout=60 # Increased to 60 second timeout per API call
|
| 807 |
)
|
| 808 |
|
|
|
|
| 375 |
|
| 376 |
return rdf_content[:1000] # Fallback
|
| 377 |
|
| 378 |
+
## [Removed duplicate get_ai_correction definition – unified below]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
def merge_corrected_sections(original_rdf: str, corrected_sections: dict) -> str:
|
| 381 |
"""
|
|
|
|
| 611 |
timeout = 120 # Increased to 120 second total timeout
|
| 612 |
if steps_log is not None:
|
| 613 |
steps_log.append(f"Timeout budget: {timeout}s total")
|
| 614 |
+
|
| 615 |
severity_instruction = "Fix only the violations (errors) and ignore any warnings." if not include_warnings else "Fix both violations and warnings."
|
| 616 |
|
| 617 |
# Try multiple attempts to generate valid RDF
|
|
|
|
| 629 |
steps_log.append(f"Attempt {attempt_no}/{max_attempts}: requesting model correction")
|
| 630 |
print(f"🔄 Correction attempt {attempt_no}/{max_attempts}")
|
| 631 |
|
| 632 |
+
# Targeted AdminMetadata guidance inferred from results text
|
| 633 |
+
needs_assigner = ("->bf:assigner" in validation_results) or (" bf:assigner" in validation_results)
|
| 634 |
+
admin_guidance = ""
|
| 635 |
+
if needs_assigner:
|
| 636 |
+
admin_guidance = """
|
| 637 |
+
IMPORTANT: For each <bf:AdminMetadata>, ensure it has a direct child <bf:assigner>.
|
| 638 |
+
Rules:
|
| 639 |
+
- If <bf:agent rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 640 |
+
- Else if <bf:descriptionModifier rdf:resource=\"...\"/> exists, add <bf:assigner rdf:resource=\"...\"/> with the SAME URI.
|
| 641 |
+
- Else if a <bf:identifiedBy> block contains <bf:assigner rdf:resource=\"...\"/>, copy that URI to a TOP-LEVEL <bf:assigner>.
|
| 642 |
+
Keep all existing content; only add missing <bf:assigner> where required.
|
| 643 |
+
"""
|
| 644 |
+
|
| 645 |
prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
|
| 646 |
|
| 647 |
{severity_instruction}
|
| 648 |
+
{admin_guidance}
|
| 649 |
|
| 650 |
Validation Errors:
|
| 651 |
{validation_results}
|
|
|
|
| 666 |
chat_completion = client.chat.completions.create(
|
| 667 |
model=HF_MODEL,
|
| 668 |
messages=[
|
| 669 |
+
{
|
| 670 |
+
"role": "system",
|
| 671 |
+
"content": "Return only valid RDF/XML content. No prose, no markdown, no code fences, no explanations."
|
| 672 |
+
},
|
| 673 |
{
|
| 674 |
"role": "user",
|
| 675 |
"content": prompt
|
| 676 |
}
|
| 677 |
],
|
| 678 |
max_tokens=2000,
|
| 679 |
+
temperature=0.0,
|
| 680 |
timeout=60 # Increased to 60 second timeout per API call
|
| 681 |
)
|
| 682 |
|