File size: 16,346 Bytes
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
2a7fd26
50fcf88
 
 
 
 
 
 
 
2a7fd26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
2a7fd26
50fcf88
2a7fd26
50fcf88
2a7fd26
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
Verification agent module for answer validation against source documents.
"""
from typing import Dict, List, Optional, Literal
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
import logging

from configuration.parameters import parameters

logger = logging.getLogger(__name__)


class VerificationResult(BaseModel):
    """Structured output model for verification results.

    Produced by the verification LLM via ``with_structured_output`` and
    consumed by ``VerificationAgent`` to build the textual report, the
    research feedback, and the retry decision.
    """
    
    # Core verdict: does the source context back the answer's claims?
    supported: Literal["YES", "NO", "PARTIAL"] = Field(
        description="Whether the answer is supported by the context"
    )
    # How certain the verifier is about the verdict above.
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the verification result"
    )
    # Specific claims from the answer that the context does not establish.
    unsupported_claims: List[str] = Field(
        default_factory=list,
        description="Claims not supported by context"
    )
    # Statements in the answer that directly conflict with the context.
    contradictions: List[str] = Field(
        default_factory=list,
        description="Contradictions between answer and context"
    )
    # Whether the answer actually addresses the original question.
    relevant: Literal["YES", "NO"] = Field(
        description="Whether the answer is relevant to the question"
    )
    # Coverage of the question; defaults to PARTIAL when the LLM omits it.
    completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field(
        default="PARTIAL",
        description="How completely the answer addresses the question"
    )
    # Free-form reasoning; appended to research feedback when non-empty.
    additional_details: str = Field(
        default="",
        description="Additional explanations and reasoning"
    )


class BestAnswerSelection(BaseModel):
    """Structured output model for selecting the best answer from candidates.

    Produced by the selection LLM in ``VerificationAgent.select_best_answer``
    when comparing multiple candidate answers against the source context.
    """
    
    # 0-based position into the candidate list; the caller range-checks it.
    selected_index: int = Field(
        description="The index (0-based) of the best answer from the candidates list"
    )
    reasoning: str = Field(
        description="Explanation of why this answer was selected as the best"
    )
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the selection"
    )
    # Optional side-by-side summary of how the candidates compared.
    comparison_summary: str = Field(
        default="",
        description="Brief comparison of the candidate answers"
    )


class VerificationAgent:
    """Agent for verifying answers against source documents.

    Wraps a Gemini chat model (or a caller-supplied LLM) with two structured
    output schemas: ``VerificationResult`` for single-answer verification
    (``check``) and ``BestAnswerSelection`` for best-of-N candidate selection
    (``select_best_answer``).
    """

    def __init__(
        self,
        llm: Optional[ChatGoogleGenerativeAI] = None,
        max_context_chars: Optional[int] = None,
        max_output_tokens: Optional[int] = None,
    ) -> None:
        """Initialize the verification agent.

        Args:
            llm: Pre-configured LLM to use. When omitted, a Gemini client is
                built from ``configuration.parameters``.
            max_context_chars: Character cap applied to concatenated document
                context; falls back to ``parameters.VERIFICATION_MAX_CONTEXT_CHARS``.
            max_output_tokens: Output-token cap for the default LLM; falls back
                to ``parameters.VERIFICATION_MAX_OUTPUT_TOKENS``.
        """
        logger.info("Initializing VerificationAgent...")

        self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS
        self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS

        base_llm = llm or ChatGoogleGenerativeAI(
            model=parameters.VERIFICATION_AGENT_MODEL,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0,  # deterministic output for verification
            max_output_tokens=self.max_output_tokens,
        )

        # Keep the raw LLM for the unstructured fallback path in check().
        self.llm = base_llm
        self.structured_llm = base_llm.with_structured_output(VerificationResult)
        self.selection_llm = base_llm.with_structured_output(BestAnswerSelection)

        logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})")

    def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str:
        """Generate the verification prompt sent to the LLM.

        Args:
            answer: The answer text to verify.
            context: Concatenated (and possibly truncated) source documents.
            question: Original question; included in the prompt when given.

        Returns:
            The fully formatted prompt string.
        """
        question_section = f"\n**Original Question:** {question}\n" if question else ""

        return f"""Verify the following answer against the provided context.

**Check for:**
1. Factual support (YES/NO/PARTIAL)
2. Confidence level (HIGH/MEDIUM/LOW)
3. Unsupported claims
4. Contradictions
5. Relevance to question
6. Completeness (COMPLETE/PARTIAL/INCOMPLETE)

**Scoring:**
- HIGH: All claims directly stated, no ambiguity
- MEDIUM: Most claims supported, some inferred
- LOW: Significant claims unsupported
{question_section}
**Answer to Verify:**
{answer}

**Context:**
{context}

Provide your verification analysis."""

    def format_verification_report(self, verification: VerificationResult) -> str:
        """Format a verification result into a human-readable markdown report.

        Labels are emitted in markdown bold (e.g. ``**Supported:** YES``);
        ``should_retry_research`` accounts for this when scanning the report.
        """
        report = f"**Supported:** {verification.supported}\n"
        report += f"**Confidence:** {verification.confidence}\n"
        report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n"
        report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n"
        report += f"**Relevant:** {verification.relevant}\n"
        report += f"**Completeness:** {verification.completeness}\n"
        report += f"**Additional Details:** {verification.additional_details or 'None'}\n"
        return report

    def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]:
        """Generate feedback for the research agent if improvements are needed.

        Returns:
            A " | "-joined feedback string, or None when no issues were found.
        """
        feedback_parts = []
        if verification.supported == "NO":
            feedback_parts.append("Answer lacks sufficient support from documents.")
        elif verification.supported == "PARTIAL":
            feedback_parts.append("Some parts are not well supported.")
        if verification.unsupported_claims:
            # Cap at three items to keep the feedback string short.
            claims_str = "; ".join(verification.unsupported_claims[:3])
            feedback_parts.append(f"Unsupported: {claims_str}")
        if verification.contradictions:
            contradictions_str = "; ".join(verification.contradictions[:3])
            feedback_parts.append(f"Contradictions: {contradictions_str}")
        if verification.completeness == "INCOMPLETE":
            feedback_parts.append("Answer is incomplete.")
        if verification.confidence == "LOW":
            feedback_parts.append("Focus on directly verifiable claims.")
        # Always add additional_details if present, even if other feedback exists
        if verification.additional_details:
            feedback_parts.append(f"Additional Details: {verification.additional_details}")
        return " | ".join(feedback_parts) if feedback_parts else None

    def should_retry_research(
        self,
        verification: VerificationResult,
        verification_report: Optional[str] = None,
        feedback: Optional[str] = None,
    ) -> bool:
        """Determine whether research should be retried.

        Structured fields are the primary signal; the textual report and the
        feedback string are scanned as legacy/fallback signals.
        """
        # Use structured fields first
        if verification.supported == "NO" or verification.relevant == "NO":
            return True
        if verification.confidence == "LOW" and (
            verification.unsupported_claims or verification.contradictions
        ):
            return True
        if verification.supported == "PARTIAL" and verification.contradictions:
            return True
        # Also check verification_report string for extra signals (legacy/fallback).
        # format_verification_report emits markdown-bold labels ("**Supported:** NO"),
        # so strip the bold markers first; without this the plain-text needles below
        # never matched and the completeness checks were dead code.
        if verification_report:
            plain_report = verification_report.replace("**", "")
            if "Supported: NO" in plain_report:
                logger.warning("[Re-Research] Answer not supported; triggering re-research.")
                return True
            elif "Relevant: NO" in plain_report:
                logger.warning("[Re-Research] Answer not relevant; triggering re-research.")
                return True
            elif "Confidence: LOW" in plain_report and "Supported: PARTIAL" in plain_report:
                logger.warning("[Re-Research] Low confidence with partial support; triggering re-research.")
                return True
            elif "Completeness: INCOMPLETE" in plain_report:
                logger.warning("[Re-Research] Answer is incomplete; triggering re-research.")
                return True
            elif "Completeness: PARTIAL" in plain_report:
                logger.warning("[Re-Research] Answer is partially complete; triggering re-research.")
                return True
        # Check feedback for contradiction/unsupported
        if feedback and ("contradiction" in feedback.lower() or "unsupported" in feedback.lower()):
            logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.")
            return True
        return False

    def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict:
        """
        Verify answer against provided documents.

        Args:
            answer: The answer to verify
            documents: Source documents for verification
            question: Optional original question

        Returns:
            Dict with verification report, context, structured result,
            retry decision, and optional feedback string.
        """
        logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents")

        context = "\n\n".join([doc.page_content for doc in documents])

        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        prompt = self.generate_prompt(answer, context, question)

        try:
            logger.debug("Calling LLM for verification...")
            verification_result: VerificationResult = self.structured_llm.invoke(prompt)
            logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})")

        except Exception as e:
            logger.error(f"Structured output failed: {e}")

            # Fallback 1: ask the raw LLM and parse its free-form reply.
            try:
                response = self.llm.invoke(prompt)
                report = response.content if hasattr(response, "content") else str(response)
                verification_result = self._parse_unstructured_response(report.strip())
            except Exception as fallback_error:
                # Fallback 2: conservative failure result that forces a retry.
                logger.error(f"Fallback failed: {fallback_error}")
                verification_result = VerificationResult(
                    supported="NO",
                    confidence="LOW",
                    relevant="NO",
                    completeness="INCOMPLETE",
                    additional_details=f"Verification failed: {str(e)}"
                )

        verification_report = self.format_verification_report(verification_result)
        feedback = self.generate_feedback_for_research(verification_result)

        if feedback:
            logger.debug(f"Generated feedback: {feedback[:80]}...")

        return {
            "verification_report": verification_report,
            "context_used": context,
            "structured_result": verification_result.model_dump(),
            "should_retry": self.should_retry_research(verification_result, verification_report, feedback),
            "feedback": feedback
        }

    def select_best_answer(
        self,
        candidate_answers: List[str],
        documents: List[Document],
        question: str
    ) -> Dict:
        """
        Select the best answer from multiple candidates based on verification criteria.

        Args:
            candidate_answers: List of candidate answers to evaluate
            documents: Source documents for verification
            question: The original question

        Returns:
            Dict with selected answer, index, reasoning, and confidence
            (plus a comparison summary when the LLM selection succeeds).
        """
        logger.info(f"Selecting best answer from {len(candidate_answers)} candidates")

        # Degenerate cases: no LLM call needed.
        if len(candidate_answers) == 0:
            logger.warning("No candidate answers provided")
            return {
                "selected_answer": "No answers were generated.",
                "selected_index": -1,
                "reasoning": "No candidates available",
                "confidence": "LOW"
            }

        if len(candidate_answers) == 1:
            logger.info("Only one candidate, returning it directly")
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": "Only one candidate answer was provided",
                "confidence": "MEDIUM"
            }

        context = "\n\n".join([doc.page_content for doc in documents])
        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        candidates_text = ""
        for i, answer in enumerate(candidate_answers):
            candidates_text += f"\n**Candidate {i + 1}:**\n{answer}\n"

        prompt = f"""You are evaluating multiple candidate answers to select the best one.

**Original Question:** {question}

**Candidate Answers:**
{candidates_text}

**Source Context:**
{context}

**Evaluation Criteria:**
1. **Factual Accuracy**: Which answer is most accurately supported by the context?
2. **Completeness**: Which answer most thoroughly addresses the question?
3. **Relevance**: Which answer stays most focused on what was asked?
4. **No Contradictions**: Which answer has the fewest contradictions with the source?
5. **Clarity**: Which answer is clearest and most well-structured?

Select the best answer by providing its index (0-based) and explain your reasoning."""

        try:
            logger.debug("Calling LLM for best answer selection...")
            selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt)

            # Guard against out-of-range indices from the LLM.
            selected_index = selection_result.selected_index
            if selected_index < 0 or selected_index >= len(candidate_answers):
                logger.warning(f"Invalid selection index {selected_index}, defaulting to 0")
                selected_index = 0

            logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence")

            return {
                "selected_answer": candidate_answers[selected_index],
                "selected_index": selected_index,
                "reasoning": selection_result.reasoning,
                "confidence": selection_result.confidence,
                "comparison_summary": selection_result.comparison_summary
            }

        except Exception as e:
            logger.error(f"Best answer selection failed: {e}")
            # Fallback: return the first candidate
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": f"Selection failed, using first candidate: {str(e)}",
                "confidence": "LOW"
            }

    def _parse_unstructured_response(self, response_text: str) -> VerificationResult:
        """Parse an unstructured LLM response into a VerificationResult (fallback).

        Scans "Key: Value" lines for the known fields; unrecognized lines are
        ignored and the claim/contradiction lists are left empty. Defaults are
        deliberately conservative (NO / LOW / INCOMPLETE) so a hard-to-parse
        reply triggers a retry rather than passing verification.
        """
        try:
            data = {
                "supported": "NO",
                "confidence": "LOW",
                "unsupported_claims": [],
                "contradictions": [],
                "relevant": "NO",
                "completeness": "INCOMPLETE",
                "additional_details": ""
            }

            for line in response_text.split('\n'):
                if ':' not in line:
                    continue
                key, value = line.split(':', 1)
                # Strip markdown decoration ("**Supported**", "- Supported",
                # "# Supported") so decorated keys still match.
                key = key.strip().strip('*-# ').lower().replace(' ', '_')
                value = value.strip().upper()

                if key == "supported":
                    data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO")
                elif key == "confidence":
                    data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW")
                elif key == "relevant":
                    data["relevant"] = "YES" if "YES" in value else "NO"
                elif key == "completeness":
                    # "INCOMPLETE" contains "COMPLETE", so exclude it explicitly.
                    if "COMPLETE" in value and "INCOMPLETE" not in value:
                        data["completeness"] = "COMPLETE"
                    elif "PARTIAL" in value:
                        data["completeness"] = "PARTIAL"

            return VerificationResult(**data)
        except Exception as e:
            logger.error(f"Failed to parse response: {e}")
            return VerificationResult(
                supported="NO",
                confidence="LOW",
                relevant="NO",
                completeness="INCOMPLETE",
                additional_details="Failed to parse verification response"
            )