File size: 16,346 Bytes
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
2a7fd26
50fcf88
 
 
 
 
 
 
 
2a7fd26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a7fd26
50fcf88
2a7fd26
50fcf88
2a7fd26
50fcf88
2a7fd26
50fcf88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
Verification agent module for answer validation against source documents.
"""
from typing import Dict, List, Optional, Literal
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
import logging

from configuration.parameters import parameters

logger = logging.getLogger(__name__)


class VerificationResult(BaseModel):
    """Structured output model for verification results.

    Produced by the verification LLM via ``with_structured_output`` and
    consumed by ``VerificationAgent`` to build the textual report, the
    research feedback, and the retry decision.
    """
    
    # Core verdict: does the source context back the answer's claims?
    supported: Literal["YES", "NO", "PARTIAL"] = Field(
        description="Whether the answer is supported by the context"
    )
    # How certain the verifier is about the verdict above.
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the verification result"
    )
    # Specific claims from the answer that the context does not establish.
    unsupported_claims: List[str] = Field(
        default_factory=list,
        description="Claims not supported by context"
    )
    # Statements in the answer that directly conflict with the context.
    contradictions: List[str] = Field(
        default_factory=list,
        description="Contradictions between answer and context"
    )
    # Whether the answer actually addresses the original question.
    relevant: Literal["YES", "NO"] = Field(
        description="Whether the answer is relevant to the question"
    )
    # Coverage of the question; defaults to PARTIAL when the LLM omits it.
    completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field(
        default="PARTIAL",
        description="How completely the answer addresses the question"
    )
    # Free-form reasoning; appended to research feedback when non-empty.
    additional_details: str = Field(
        default="",
        description="Additional explanations and reasoning"
    )


class BestAnswerSelection(BaseModel):
    """Structured output model for selecting the best answer from candidates.

    Produced by the selection LLM in ``VerificationAgent.select_best_answer``
    when comparing multiple candidate answers against the source context.
    """
    
    # 0-based position into the candidate list; the caller range-checks it.
    selected_index: int = Field(
        description="The index (0-based) of the best answer from the candidates list"
    )
    reasoning: str = Field(
        description="Explanation of why this answer was selected as the best"
    )
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the selection"
    )
    # Optional side-by-side summary of how the candidates compared.
    comparison_summary: str = Field(
        default="",
        description="Brief comparison of the candidate answers"
    )


class VerificationAgent:
    """Agent for verifying answers against source documents.

    Wraps a Gemini chat model (or a caller-supplied LLM) with two structured
    output schemas: ``VerificationResult`` for single-answer verification
    (``check``) and ``BestAnswerSelection`` for best-of-N candidate selection
    (``select_best_answer``).
    """

    def __init__(
        self,
        llm: Optional[ChatGoogleGenerativeAI] = None,
        max_context_chars: Optional[int] = None,
        max_output_tokens: Optional[int] = None,
    ) -> None:
        """Initialize the verification agent.

        Args:
            llm: Pre-configured LLM to use. When omitted, a Gemini client is
                built from ``configuration.parameters``.
            max_context_chars: Character cap applied to concatenated document
                context; falls back to ``parameters.VERIFICATION_MAX_CONTEXT_CHARS``.
            max_output_tokens: Output-token cap for the default LLM; falls back
                to ``parameters.VERIFICATION_MAX_OUTPUT_TOKENS``.
        """
        logger.info("Initializing VerificationAgent...")

        self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS
        self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS

        base_llm = llm or ChatGoogleGenerativeAI(
            model=parameters.VERIFICATION_AGENT_MODEL,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0,  # deterministic output for verification
            max_output_tokens=self.max_output_tokens,
        )

        # Keep the raw LLM for the unstructured fallback path in check().
        self.llm = base_llm
        self.structured_llm = base_llm.with_structured_output(VerificationResult)
        self.selection_llm = base_llm.with_structured_output(BestAnswerSelection)

        logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})")

    def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str:
        """Generate the verification prompt sent to the LLM.

        Args:
            answer: The answer text to verify.
            context: Concatenated (and possibly truncated) source documents.
            question: Original question; included in the prompt when given.

        Returns:
            The fully formatted prompt string.
        """
        question_section = f"\n**Original Question:** {question}\n" if question else ""

        return f"""Verify the following answer against the provided context.

**Check for:**
1. Factual support (YES/NO/PARTIAL)
2. Confidence level (HIGH/MEDIUM/LOW)
3. Unsupported claims
4. Contradictions
5. Relevance to question
6. Completeness (COMPLETE/PARTIAL/INCOMPLETE)

**Scoring:**
- HIGH: All claims directly stated, no ambiguity
- MEDIUM: Most claims supported, some inferred
- LOW: Significant claims unsupported
{question_section}
**Answer to Verify:**
{answer}

**Context:**
{context}

Provide your verification analysis."""

    def format_verification_report(self, verification: VerificationResult) -> str:
        """Format a verification result into a human-readable markdown report.

        Labels are emitted in markdown bold (e.g. ``**Supported:** YES``);
        ``should_retry_research`` accounts for this when scanning the report.
        """
        report = f"**Supported:** {verification.supported}\n"
        report += f"**Confidence:** {verification.confidence}\n"
        report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n"
        report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n"
        report += f"**Relevant:** {verification.relevant}\n"
        report += f"**Completeness:** {verification.completeness}\n"
        report += f"**Additional Details:** {verification.additional_details or 'None'}\n"
        return report

    def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]:
        """Generate feedback for the research agent if improvements are needed.

        Returns:
            A " | "-joined feedback string, or None when no issues were found.
        """
        feedback_parts = []
        if verification.supported == "NO":
            feedback_parts.append("Answer lacks sufficient support from documents.")
        elif verification.supported == "PARTIAL":
            feedback_parts.append("Some parts are not well supported.")
        if verification.unsupported_claims:
            # Cap at three items to keep the feedback string short.
            claims_str = "; ".join(verification.unsupported_claims[:3])
            feedback_parts.append(f"Unsupported: {claims_str}")
        if verification.contradictions:
            contradictions_str = "; ".join(verification.contradictions[:3])
            feedback_parts.append(f"Contradictions: {contradictions_str}")
        if verification.completeness == "INCOMPLETE":
            feedback_parts.append("Answer is incomplete.")
        if verification.confidence == "LOW":
            feedback_parts.append("Focus on directly verifiable claims.")
        # Always add additional_details if present, even if other feedback exists
        if verification.additional_details:
            feedback_parts.append(f"Additional Details: {verification.additional_details}")
        return " | ".join(feedback_parts) if feedback_parts else None

    def should_retry_research(
        self,
        verification: VerificationResult,
        verification_report: Optional[str] = None,
        feedback: Optional[str] = None,
    ) -> bool:
        """Determine whether research should be retried.

        Structured fields are the primary signal; the textual report and the
        feedback string are scanned as legacy/fallback signals.
        """
        # Use structured fields first
        if verification.supported == "NO" or verification.relevant == "NO":
            return True
        if verification.confidence == "LOW" and (
            verification.unsupported_claims or verification.contradictions
        ):
            return True
        if verification.supported == "PARTIAL" and verification.contradictions:
            return True
        # Also check verification_report string for extra signals (legacy/fallback).
        # format_verification_report emits markdown-bold labels ("**Supported:** NO"),
        # so strip the bold markers first; without this the plain-text needles below
        # never matched and the completeness checks were dead code.
        if verification_report:
            plain_report = verification_report.replace("**", "")
            if "Supported: NO" in plain_report:
                logger.warning("[Re-Research] Answer not supported; triggering re-research.")
                return True
            elif "Relevant: NO" in plain_report:
                logger.warning("[Re-Research] Answer not relevant; triggering re-research.")
                return True
            elif "Confidence: LOW" in plain_report and "Supported: PARTIAL" in plain_report:
                logger.warning("[Re-Research] Low confidence with partial support; triggering re-research.")
                return True
            elif "Completeness: INCOMPLETE" in plain_report:
                logger.warning("[Re-Research] Answer is incomplete; triggering re-research.")
                return True
            elif "Completeness: PARTIAL" in plain_report:
                logger.warning("[Re-Research] Answer is partially complete; triggering re-research.")
                return True
        # Check feedback for contradiction/unsupported
        if feedback and ("contradiction" in feedback.lower() or "unsupported" in feedback.lower()):
            logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.")
            return True
        return False

    def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict:
        """
        Verify answer against provided documents.

        Args:
            answer: The answer to verify
            documents: Source documents for verification
            question: Optional original question

        Returns:
            Dict with verification report, context, structured result,
            retry decision, and optional feedback string.
        """
        logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents")

        context = "\n\n".join([doc.page_content for doc in documents])

        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        prompt = self.generate_prompt(answer, context, question)

        try:
            logger.debug("Calling LLM for verification...")
            verification_result: VerificationResult = self.structured_llm.invoke(prompt)
            logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})")

        except Exception as e:
            logger.error(f"Structured output failed: {e}")

            # Fallback 1: ask the raw LLM and parse its free-form reply.
            try:
                response = self.llm.invoke(prompt)
                report = response.content if hasattr(response, "content") else str(response)
                verification_result = self._parse_unstructured_response(report.strip())
            except Exception as fallback_error:
                # Fallback 2: conservative failure result that forces a retry.
                logger.error(f"Fallback failed: {fallback_error}")
                verification_result = VerificationResult(
                    supported="NO",
                    confidence="LOW",
                    relevant="NO",
                    completeness="INCOMPLETE",
                    additional_details=f"Verification failed: {str(e)}"
                )

        verification_report = self.format_verification_report(verification_result)
        feedback = self.generate_feedback_for_research(verification_result)

        if feedback:
            logger.debug(f"Generated feedback: {feedback[:80]}...")

        return {
            "verification_report": verification_report,
            "context_used": context,
            "structured_result": verification_result.model_dump(),
            "should_retry": self.should_retry_research(verification_result, verification_report, feedback),
            "feedback": feedback
        }

    def select_best_answer(
        self,
        candidate_answers: List[str],
        documents: List[Document],
        question: str
    ) -> Dict:
        """
        Select the best answer from multiple candidates based on verification criteria.

        Args:
            candidate_answers: List of candidate answers to evaluate
            documents: Source documents for verification
            question: The original question

        Returns:
            Dict with selected answer, index, reasoning, and confidence
            (plus a comparison summary when the LLM selection succeeds).
        """
        logger.info(f"Selecting best answer from {len(candidate_answers)} candidates")

        # Degenerate cases: no LLM call needed.
        if len(candidate_answers) == 0:
            logger.warning("No candidate answers provided")
            return {
                "selected_answer": "No answers were generated.",
                "selected_index": -1,
                "reasoning": "No candidates available",
                "confidence": "LOW"
            }

        if len(candidate_answers) == 1:
            logger.info("Only one candidate, returning it directly")
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": "Only one candidate answer was provided",
                "confidence": "MEDIUM"
            }

        context = "\n\n".join([doc.page_content for doc in documents])
        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        candidates_text = ""
        for i, answer in enumerate(candidate_answers):
            candidates_text += f"\n**Candidate {i + 1}:**\n{answer}\n"

        prompt = f"""You are evaluating multiple candidate answers to select the best one.

**Original Question:** {question}

**Candidate Answers:**
{candidates_text}

**Source Context:**
{context}

**Evaluation Criteria:**
1. **Factual Accuracy**: Which answer is most accurately supported by the context?
2. **Completeness**: Which answer most thoroughly addresses the question?
3. **Relevance**: Which answer stays most focused on what was asked?
4. **No Contradictions**: Which answer has the fewest contradictions with the source?
5. **Clarity**: Which answer is clearest and most well-structured?

Select the best answer by providing its index (0-based) and explain your reasoning."""

        try:
            logger.debug("Calling LLM for best answer selection...")
            selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt)

            # Guard against out-of-range indices from the LLM.
            selected_index = selection_result.selected_index
            if selected_index < 0 or selected_index >= len(candidate_answers):
                logger.warning(f"Invalid selection index {selected_index}, defaulting to 0")
                selected_index = 0

            logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence")

            return {
                "selected_answer": candidate_answers[selected_index],
                "selected_index": selected_index,
                "reasoning": selection_result.reasoning,
                "confidence": selection_result.confidence,
                "comparison_summary": selection_result.comparison_summary
            }

        except Exception as e:
            logger.error(f"Best answer selection failed: {e}")
            # Fallback: return the first candidate
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": f"Selection failed, using first candidate: {str(e)}",
                "confidence": "LOW"
            }

    def _parse_unstructured_response(self, response_text: str) -> VerificationResult:
        """Parse an unstructured LLM response into a VerificationResult (fallback).

        Scans "Key: Value" lines for the known fields; unrecognized lines are
        ignored and the claim/contradiction lists are left empty. Defaults are
        deliberately conservative (NO / LOW / INCOMPLETE) so a hard-to-parse
        reply triggers a retry rather than passing verification.
        """
        try:
            data = {
                "supported": "NO",
                "confidence": "LOW",
                "unsupported_claims": [],
                "contradictions": [],
                "relevant": "NO",
                "completeness": "INCOMPLETE",
                "additional_details": ""
            }

            for line in response_text.split('\n'):
                if ':' not in line:
                    continue
                key, value = line.split(':', 1)
                # Strip markdown decoration ("**Supported**", "- Supported",
                # "# Supported") so decorated keys still match.
                key = key.strip().strip('*-# ').lower().replace(' ', '_')
                value = value.strip().upper()

                if key == "supported":
                    data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO")
                elif key == "confidence":
                    data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW")
                elif key == "relevant":
                    data["relevant"] = "YES" if "YES" in value else "NO"
                elif key == "completeness":
                    # "INCOMPLETE" contains "COMPLETE", so exclude it explicitly.
                    if "COMPLETE" in value and "INCOMPLETE" not in value:
                        data["completeness"] = "COMPLETE"
                    elif "PARTIAL" in value:
                        data["completeness"] = "PARTIAL"

            return VerificationResult(**data)
        except Exception as e:
            logger.error(f"Failed to parse response: {e}")
            return VerificationResult(
                supported="NO",
                confidence="LOW",
                relevant="NO",
                completeness="INCOMPLETE",
                additional_details="Failed to parse verification response"
            )