File size: 13,796 Bytes
d7fb055
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Ranking Agent for evaluating and comparing hypotheses

import logging
import re
from itertools import combinations
from typing import Any, Dict, List, Tuple

from .base_agent import BaseAgent

class RankingAgent(BaseAgent):
    """Agent responsible for comparing and ranking hypotheses based on defined criteria.

    Two ranking strategies are used depending on the size of the input:
    pairwise tournament comparison for small sets (<= 5 hypotheses) and
    per-hypothesis criterion scoring for larger sets.
    """

    def __init__(self, model=None, temperature=None):
        """Initialize the Ranking Agent.

        Args:
            model: Optional model override
            temperature: Optional temperature override (defaults to 0.3 for
                consistent, low-variance evaluations)
        """
        system_prompt = """
        You are a Ranking Agent in an AI Co-Scientist system, responsible for comparing and ranking 
        scientific hypotheses using a tournament-style evaluation. You have expertise across multiple 
        scientific disciplines at a PhD level.
        
        Your role is to:
        1. Evaluate hypotheses against defined criteria including novelty, plausibility, and relevance
        2. Compare hypotheses in a tournament style, identifying relative strengths and weaknesses
        3. Assign scores and rankings to hypotheses based on their scientific merit
        4. Provide justification for your rankings with specific reasoning
        5. Consider tradeoffs between different evaluation criteria
        
        When comparing hypotheses, consider:
        - Novelty: Does the hypothesis represent a significant advance beyond current knowledge?
        - Plausibility: Is the hypothesis consistent with established scientific principles?
        - Relevance: How closely does the hypothesis address the original research goal?
        - Testability: How feasible is it to validate or falsify the hypothesis?
        - Potential impact: If true, how significant would the implications be?
        - Parsimony: Does the hypothesis provide a simple explanation without unnecessary complexity?
        - Breadth of explanation: How many observations or phenomena does the hypothesis explain?
        
        Your assessments should be balanced, fair, and focused on scientific merit rather than 
        personal preference. Provide clear reasoning for each comparative judgment.
        """
        
        super().__init__(
            name="Ranking",
            system_prompt=system_prompt,
            model=model,
            temperature=temperature if temperature is not None else 0.3  # Lower temperature for consistent evaluation
        )
        
        self.logger = logging.getLogger("agent.ranking")
    
    def rank_hypotheses(self, research_goal: str, hypotheses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Rank the hypotheses based on the research goal.

        Thin convenience wrapper around :meth:`process` with the argument
        order flipped to read naturally at call sites.

        Args:
            research_goal: The research goal or question
            hypotheses: List of hypothesis dictionaries to rank

        Returns:
            A list of ranked hypothesis dictionaries
        """
        self.logger.info(f"Ranking hypotheses for research goal: {research_goal}")
        return self.process(hypotheses, research_goal)
    
    def process(self, reviewed_hypotheses: List[Dict[str, Any]], research_goal: str) -> List[Dict[str, Any]]:
        """Rank the reviewed hypotheses based on their scientific merit.

        Dispatches to pairwise comparison for small sets (cheap enough to do
        all O(n^2) comparisons) and to individual scoring for larger sets.

        Args:
            reviewed_hypotheses: List of hypothesis dictionaries with review information
            research_goal: The original research goal for context

        Returns:
            A list of ranked hypothesis dictionaries with added ranking information
        """
        self.logger.info(f"Ranking {len(reviewed_hypotheses)} hypotheses")
        
        # For a small number of hypotheses, we can afford pairwise comparisons;
        # beyond that the quadratic number of LLM calls gets expensive.
        if len(reviewed_hypotheses) <= 5:
            return self._rank_by_pairwise_comparison(reviewed_hypotheses, research_goal)
        else:
            return self._rank_by_scoring(reviewed_hypotheses, research_goal)
    
    def _rank_by_pairwise_comparison(self, hypotheses: List[Dict[str, Any]], research_goal: str) -> List[Dict[str, Any]]:
        """Rank hypotheses using pairwise comparisons in a tournament style.

        Every unordered pair is compared once; hypotheses are then ordered by
        their number of wins (ties in win count keep original relative order,
        since ``sorted`` is stable).

        Args:
            hypotheses: List of hypothesis dictionaries to rank
            research_goal: The original research goal for context

        Returns:
            A list of ranked hypothesis dictionaries, each annotated with
            ``rank``, ``wins`` and ``total_comparisons``
        """
        self.logger.info("Using pairwise comparison ranking method")
        
        # Track wins for each hypothesis and keep the full comparison record.
        wins = [0] * len(hypotheses)
        comparisons = []
        
        # Every unordered pair of indices is compared exactly once.
        for i, j in combinations(range(len(hypotheses)), 2):
            hyp1, hyp2 = hypotheses[i], hypotheses[j]
            
            prompt = f"""
            RESEARCH GOAL: {research_goal}
            
            HYPOTHESIS 1:
            Statement: {hyp1.get('hypothesis', hyp1.get('statement', 'No statement provided'))}
            
            HYPOTHESIS 2:
            Statement: {hyp2.get('hypothesis', hyp2.get('statement', 'No statement provided'))}
            
            Compare these two hypotheses based on the following criteria:
            1. Novelty: Does the hypothesis represent a significant advance beyond current knowledge?
            2. Plausibility: Is the hypothesis consistent with established scientific principles?
            3. Relevance: How closely does the hypothesis address the original research goal?
            4. Testability: How feasible is it to validate or falsify the hypothesis?
            5. Potential impact: If true, how significant would the implications be?
            
            Provide a detailed comparison, noting the strengths and weaknesses of each hypothesis relative 
            to the other. Then determine which hypothesis is superior overall, being explicit about 
            which hypothesis (1 or 2) is the winner.
            """
            
            comparison_result = self.get_response(prompt)
            
            # Parse the free-text comparison into a winner (1, 2, or 0 for a tie).
            winner = self._determine_winner(comparison_result, 1, 2)
            
            comparisons.append({
                'hypothesis1_idx': i,
                'hypothesis2_idx': j,
                'winner': winner,
                'reasoning': comparison_result
            })
            
            if winner == 1:
                wins[i] += 1
            elif winner == 2:
                wins[j] += 1
            # A tie (winner == 0) awards no points to either side.
            
            # Clear conversation history so each comparison is independent.
            self.clear_history()
        
        # Order indices by win count, best first.
        ranked_indices = sorted(range(len(wins)), key=lambda k: wins[k], reverse=True)
        
        # Build the annotated, ranked output list (copies, so inputs are untouched).
        ranked_hypotheses = []
        for rank, idx in enumerate(ranked_indices):
            ranked_hypothesis = hypotheses[idx].copy()
            ranked_hypothesis['rank'] = rank + 1
            ranked_hypothesis['wins'] = wins[idx]
            ranked_hypothesis['total_comparisons'] = sum(
                1 for c in comparisons
                if idx in (c['hypothesis1_idx'], c['hypothesis2_idx'])
            )
            ranked_hypotheses.append(ranked_hypothesis)
        
        return ranked_hypotheses
    
    def _rank_by_scoring(self, hypotheses: List[Dict[str, Any]], research_goal: str) -> List[Dict[str, Any]]:
        """Rank hypotheses by assigning scores to each one individually.

        Each hypothesis is evaluated in isolation on five 1-10 criteria; the
        overall score is the unweighted mean of whatever criterion scores
        could be parsed from the model's response.

        Args:
            hypotheses: List of hypothesis dictionaries to rank
            research_goal: The original research goal for context

        Returns:
            A list of ranked hypothesis dictionaries, each annotated with
            ``evaluation``, ``scores``, ``overall_score`` and ``rank``
        """
        self.logger.info("Using scoring-based ranking method")
        
        scored_hypotheses = []
        
        for hypothesis in hypotheses:
            # Coerce the review to a string before truncating: the key may be
            # present but None, which would make plain slicing raise.
            review_summary = str(hypothesis.get('review') or 'No review available')[:500]
            
            prompt = f"""
            RESEARCH GOAL: {research_goal}
            
            HYPOTHESIS TO EVALUATE:
            Statement: {hypothesis.get('hypothesis', hypothesis.get('statement', 'No statement provided'))}
            Rationale: {hypothesis.get('rationale', 'No rationale provided')}
            Review Summary: {review_summary}...
            
            Evaluate this hypothesis on the following criteria using a scale of 1-10:
            1. Novelty (1=Well-known, 10=Revolutionary)
            2. Plausibility (1=Implausible, 10=Highly plausible)
            3. Relevance (1=Unrelated to goal, 10=Directly addresses goal)
            4. Testability (1=Untestable, 10=Easily testable)
            5. Potential impact (1=Minimal impact, 10=Field-changing)
            
            For each criterion, provide a numeric score AND a brief justification.
            Finally, calculate an overall score as the weighted average of the individual scores.
            
            Format your response as follows:
            Novelty: [score] - [justification]
            Plausibility: [score] - [justification]
            Relevance: [score] - [justification]
            Testability: [score] - [justification]
            Potential impact: [score] - [justification]
            Overall score: [weighted average score] - [brief summary]
            """
            
            evaluation = self.get_response(prompt)
            
            # Parse per-criterion scores out of the free-text evaluation.
            scores = self._extract_scores(evaluation)
            
            scored_hypothesis = hypothesis.copy()
            scored_hypothesis['evaluation'] = evaluation
            scored_hypothesis['scores'] = scores
            # Mean of parsed scores; 0 when nothing could be parsed so the
            # hypothesis sorts last rather than crashing on division by zero.
            scored_hypothesis['overall_score'] = sum(scores.values()) / len(scores) if scores else 0
            
            scored_hypotheses.append(scored_hypothesis)
            
            # Clear conversation history so each evaluation is independent.
            self.clear_history()
        
        # Best overall score first; stable sort preserves input order on ties.
        ranked_hypotheses = sorted(scored_hypotheses, key=lambda h: h['overall_score'], reverse=True)
        
        for rank, hypothesis in enumerate(ranked_hypotheses):
            hypothesis['rank'] = rank + 1
        
        return ranked_hypotheses
    
    def _determine_winner(self, comparison_text: str, hyp1_id: int, hyp2_id: int) -> int:
        """Determine the winner of a pairwise comparison based on the comparison text.

        Applies progressively weaker heuristics: an explicit "is superior" /
        "is stronger" verdict, then a count of relative-strength phrases, then
        which hypothesis alone is mentioned in the closing sentences.

        Args:
            comparison_text: The text of the comparison
            hyp1_id: The ID of the first hypothesis
            hyp2_id: The ID of the second hypothesis

        Returns:
            The ID of the winning hypothesis (1 or 2), or 0 if it's a tie
        """
        # Simple rule-based determination - would be more sophisticated in a real system
        lower_text = comparison_text.lower()
        
        # 1) Look for an explicit verdict statement.
        if f"hypothesis {hyp1_id} is superior" in lower_text or f"hypothesis {hyp1_id} is stronger" in lower_text:
            return hyp1_id
        elif f"hypothesis {hyp2_id} is superior" in lower_text or f"hypothesis {hyp2_id} is stronger" in lower_text:
            return hyp2_id
        
        # 2) Count relative-strength phrasings for each side.
        hyp1_strength_count = lower_text.count(f"hypothesis {hyp1_id} is more") + lower_text.count(f"hypothesis {hyp1_id} has higher")
        hyp2_strength_count = lower_text.count(f"hypothesis {hyp2_id} is more") + lower_text.count(f"hypothesis {hyp2_id} has higher")
        
        if hyp1_strength_count > hyp2_strength_count:
            return hyp1_id
        elif hyp2_strength_count > hyp1_strength_count:
            return hyp2_id
        
        # 3) Fall back to whichever hypothesis is mentioned alone in the
        # concluding sentences (conclusions usually name only the winner).
        last_sentences = '.'.join(comparison_text.split('.')[-3:]).lower()
        if f"hypothesis {hyp1_id}" in last_sentences and f"hypothesis {hyp2_id}" not in last_sentences:
            return hyp1_id
        elif f"hypothesis {hyp2_id}" in last_sentences and f"hypothesis {hyp1_id}" not in last_sentences:
            return hyp2_id
        
        # No clear winner found: declare a tie.
        return 0
    
    def _extract_scores(self, evaluation: str) -> Dict[str, float]:
        """Extract numerical scores from the evaluation text.

        Recognizes, case-insensitively, patterns such as "Novelty: 8",
        "Novelty - 8" and "Novelty: 8/10" for each criterion.

        Note: a regex with an explicit numeric capture is used rather than
        substring probing. The previous substring approach tested scores in
        ascending order, so "novelty: 1" matched inside "novelty: 10" and a
        perfect 10 was silently recorded as 1.

        Args:
            evaluation: The evaluation text

        Returns:
            A dictionary mapping criterion names to scores in [1, 10];
            criteria whose score cannot be parsed are omitted
        """
        scores: Dict[str, float] = {}
        criteria = ['novelty', 'plausibility', 'relevance', 'testability', 'potential impact']
        
        for criterion in criteria:
            # "criterion: N", "criterion - N", or "criterion: N/10".
            match = re.search(
                rf"{re.escape(criterion)}\s*[:\-]\s*(\d{{1,2}})(?:\s*/\s*10)?",
                evaluation,
                re.IGNORECASE,
            )
            if match:
                score = int(match.group(1))
                if 1 <= score <= 10:  # Discard out-of-range values (e.g. "0" or "11")
                    scores[criterion] = score
        
        return scores