File size: 4,761 Bytes
b0c0df0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
from typing import Any, Dict, Optional, Tuple, Union

from .prompt import (
    BINARY_JUDGE_PROMPT,
    COMPARATIVE_JUDGE_PROMPT,
    CORRECTNESS_JUDGE_PROMPT,
)


class JudgePromptBuilder:
    """Static helpers that render the prompt text sent to a judge model.

    Each builder fills one of the templates imported from ``.prompt``;
    the binary and comparative builders can instead fill a
    caller-supplied template.
    """

    @staticmethod
    def build_binary_prompt(
        question: str,
        answer: str,
        prediction: str,
        output_format: str = "0/1",
        custom_prompt: Optional[str] = None,
        **kwargs,
    ) -> str:
        """Render the prompt for a binary (pass/fail) judgement.

        Args:
            question: Question text inserted into the template.
            answer: Reference answer inserted into the template.
            prediction: Model output being judged.
            output_format: ``"0/1"`` or ``"1/0"`` selects the labels
                ``"1"``/``"0"``; any other value selects ``"Yes"``/``"No"``.
            custom_prompt: Optional ``str.format`` template. When truthy it
                is used instead of ``BINARY_JUDGE_PROMPT`` and receives
                ``question``, ``answer``, ``pred``, ``prediction`` and every
                extra keyword argument; ``output_format`` is not consulted.
            **kwargs: Extra fields forwarded only to ``custom_prompt``.

        Returns:
            The formatted prompt string.
        """
        if custom_prompt:
            fields = {
                "question": question,
                "answer": answer,
                # The prediction is exposed under both placeholder names.
                "pred": prediction,
                "prediction": prediction,
            }
            return custom_prompt.format(**fields, **kwargs)

        if output_format in ("0/1", "1/0"):
            positive, negative = "1", "0"
        else:
            positive, negative = "Yes", "No"

        return BINARY_JUDGE_PROMPT.format(
            question=question,
            answer=answer,
            prediction=prediction,
            positive=positive,
            negative=negative,
        )

    @staticmethod
    def build_comparative_prompt(
        question: str,
        response1: str,
        response2: str,
        context: Optional[str] = None,
        score_range: Tuple[int, int] = (1, 10),
        custom_prompt: Optional[str] = None,
        evaluation_instruction: Optional[str] = None,
        **kwargs,
    ) -> str:
        """Render the prompt for scoring two responses against each other.

        Args:
            question: Question text inserted into the template.
            response1: First candidate response.
            response2: Second candidate response.
            context: Optional context; when truthy it is rendered as a
                ``[Context]`` section, otherwise the section is empty.
            score_range: ``(min, max)`` scores inserted into the template
                and into the default evaluation instruction.
            custom_prompt: Optional ``str.format`` template. When truthy it
                is used instead of ``COMPARATIVE_JUDGE_PROMPT`` and receives
                ``question``, ``response1``, ``response2``, ``context``
                (empty string when falsy) and every extra keyword argument.
            evaluation_instruction: Optional instruction text; when falsy a
                default sentence naming the score range is generated.
            **kwargs: Extra fields forwarded only to ``custom_prompt``.

        Returns:
            The formatted prompt string.
        """
        if custom_prompt:
            return custom_prompt.format(
                question=question,
                response1=response1,
                response2=response2,
                context=context if context else "",
                **kwargs,
            )

        if context:
            context_section = f"[Context]\n{context}\n\n"
        else:
            context_section = ""

        lowest, highest = score_range[0], score_range[1]
        instruction = evaluation_instruction
        if not instruction:
            instruction = f"Please provide scores from {lowest} to {highest}."

        return COMPARATIVE_JUDGE_PROMPT.format(
            question=question,
            response1=response1,
            response2=response2,
            context_section=context_section,
            min_score=lowest,
            max_score=highest,
            evaluation_instruction=instruction,
        )

    @staticmethod
    def build_correctness_prompt(
        question: str,
        answer: str,
        prediction: str,
        output_format: str = "yes/no",
        **kwargs,
    ) -> str:
        """Render the prompt for a correctness judgement.

        Args:
            question: Question text inserted into the template.
            answer: Reference answer inserted into the template.
            prediction: Model output being judged.
            output_format: ``"yes/no"`` selects the labels ``"Yes"``/``"No"``;
                any other value selects ``"1"``/``"0"``.
            **kwargs: Accepted but not used by this builder.

        Returns:
            ``CORRECTNESS_JUDGE_PROMPT`` filled with the values above.
        """
        if output_format == "yes/no":
            positive, negative = "Yes", "No"
        else:
            positive, negative = "1", "0"

        return CORRECTNESS_JUDGE_PROMPT.format(
            question=question,
            answer=answer,
            prediction=prediction,
            positive=positive,
            negative=negative,
        )


class ResponseParser:
    """Helper class to parse different types of judge responses.

    Every parser is best-effort: input that cannot be interpreted yields a
    documented fallback value instead of raising.
    """

    # A decimal number with an optional sign. The lookbehind stops a hyphen
    # glued to a preceding digit (the separator in "8-9") from being read as
    # the minus sign of the next number.
    _NUMBER_RE = re.compile(r"(?<!\d)-?\d+(?:\.\d+)?")

    @staticmethod
    def parse_binary_response(response: str, output_format: str = "0/1") -> Union[int, bool]:
        """Parse a binary verdict (0/1 or yes/no) from a judge response.

        Args:
            response: Raw judge output. Anything that is not a string is
                treated as an empty response.
            output_format: ``"0/1"`` or ``"1/0"`` selects digit parsing; any
                other value selects yes/no parsing.

        Returns:
            In digit mode, ``1`` or ``0``: the digit following a ``score`` or
            ``answer`` label if one is present, otherwise the first standalone
            ``0``/``1`` token, otherwise ``0``. Digits embedded in longer
            numbers (``"10"``, ``"0.1"``) are not verdicts.
            In yes/no mode, ``True`` when the response begins (after optional
            punctuation or whitespace) with the whole word "yes".
        """
        text = response.strip().lower() if isinstance(response, str) else ""

        if output_format in ("0/1", "1/0"):
            # An explicitly labelled verdict wins over any digit mentioned
            # earlier in free-form reasoning.
            labelled = re.search(r"(?:score|answer)\s*[:=]\s*\[?([01])\]?(?!\.?\d)", text)
            if labelled:
                return int(labelled.group(1))

            # Otherwise take the first 0/1 that is not part of a larger number.
            standalone = re.search(r"(?<![\d.])[01](?!\.?\d)", text)
            return int(standalone.group()) if standalone else 0

        # yes/no format: require a whole word so "yesterday" is not a "yes",
        # while markup such as "**Yes**" is still accepted.
        return re.match(r"\W*yes\b", text) is not None

    @staticmethod
    def parse_score_response(response: str, score_range: Optional[Tuple[float, float]] = None) -> float:
        """Parse a single numeric score from a judge response.

        Args:
            response: Raw judge output.
            score_range: Optional ``(min, max)`` bounds; when given, the
                parsed score is clamped into this range.

        Returns:
            The first number found in ``response`` (clamped if a range is
            given). When no number can be extracted, ``score_range[0]`` if a
            range is given, else ``0.0``.
        """
        try:
            found = ResponseParser._NUMBER_RE.search(response)
            if found is not None:
                score = float(found.group())
                if score_range:
                    score = max(score_range[0], min(score, score_range[1]))
                return score
        except (TypeError, ValueError, IndexError):
            # Non-string response or malformed range: fall through to the
            # default below rather than failing the whole evaluation.
            pass

        # Return minimum score as default
        return score_range[0] if score_range else 0.0

    @staticmethod
    def parse_comparative_response(response: str) -> Tuple[float, float]:
        """Parse the two comparative scores from a judge response.

        Only the first line of the stripped response is inspected. Scores may
        be separated by whitespace, commas, semicolons or a hyphen.

        Args:
            response: Raw judge output.

        Returns:
            ``(score1, score2)`` taken from the first two numbers on the first
            line, or ``(-1.0, -1.0)`` when fewer than two numbers are present
            or ``response`` is not a string.
        """
        if not isinstance(response, str):
            return -1.0, -1.0

        first_line = response.strip().partition("\n")[0]
        scores = ResponseParser._NUMBER_RE.findall(first_line)
        if len(scores) >= 2:
            return float(scores[0]), float(scores[1])

        return -1.0, -1.0

    @staticmethod
    def parse_json_response(response: str) -> Dict[str, Any]:
        """Extract the first JSON object embedded in a judge response.

        Decoding is attempted at each ``{`` in turn, so prose containing stray
        braces before or after the JSON does not defeat the extraction.

        Args:
            response: Raw judge output, possibly with text around the JSON.

        Returns:
            The first JSON object that decodes successfully, or ``{}`` when
            none is found or ``response`` is not a string.
        """
        import json

        if not isinstance(response, str):
            return {}

        decoder = json.JSONDecoder()
        start = response.find("{")
        while start != -1:
            try:
                # raw_decode parses one JSON value and ignores trailing text.
                parsed, _ = decoder.raw_decode(response[start:])
            except (ValueError, RecursionError):
                start = response.find("{", start + 1)
            else:
                return parsed

        return {}