"""Multi-agent system for literature review using OpenAI-compatible API.""" import json import re import os import time from typing import Any, Optional, Dict, Tuple from openai import OpenAI def extract_json_between_markers(llm_output: str) -> Optional[Dict[str, Any]]: """Extracts JSON content from a string, typically an LLM output.""" json_pattern = r"```json(.*?)```" matches = re.findall(json_pattern, llm_output, re.DOTALL) if not matches: json_pattern_fallback = r"\{[^{}]*\}" matches = re.findall(json_pattern_fallback, llm_output, re.DOTALL) for json_string in matches: json_string = json_string.strip() try: parsed_json = json.loads(json_string) return parsed_json except json.JSONDecodeError: try: json_string_clean = "".join( char for char in json_string if ord(char) >= 32 and ord(char) != 127 ) parsed_json = json.loads(json_string_clean) return parsed_json except json.JSONDecodeError: continue return None def query_model(system_prompt: str, prompt: str, client: OpenAI, model: str) -> Optional[str]: """Query the model with the given prompts using OpenAI-compatible API with rate limiting.""" try: # Rate limiting: 1 request per second to avoid concurrency issues time.sleep(1) response = client.chat.completions.create( model=model, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt} ], temperature=0.7, max_tokens=4000 ) return response.choices[0].message.content except Exception as e: print(f"Error querying model: {e}") # Wait before retry time.sleep(2) return None def get_score( paper_content: str, reviewer_type: Optional[str] = None, attempts: int = 3, client: OpenAI = None, model: str = None, ) -> Tuple[Optional[float], str, bool]: """Evaluates a research paper using an LLM reviewer.""" last_exception_message = "" for attempt in range(attempts): try: template_instructions = """ Respond in the following format: THOUGHT: REVIEW JSON: ```json ``` In , first briefly discuss your intuitions and reasoning for the evaluation. Detail your high-level arguments, necessary choices and desired outcomes of the review. In , provide the review in JSON format with the following fields: - "Summary": A summary of the paper content and its contributions. - "Strengths": A list of strengths of the paper. - "Weaknesses": A list of weaknesses of the paper. - "Originality": A rating from 1 to 4 (low, medium, high, very high). - "Quality": A rating from 1 to 4 (low, medium, high, very high). - "Clarity": A rating from 1 to 4 (low, medium, high, very high). - "Significance": A rating from 1 to 4 (low, medium, high, very high). - "Questions": A set of clarifying questions to be answered by the paper authors. - "Limitations": A set of limitations and potential negative societal impacts. - "Ethical Concerns": A boolean value indicating whether there are ethical concerns. - "Soundness": A rating from 1 to 4 (poor, fair, good, excellent). - "Presentation": A rating from 1 to 4 (poor, fair, good, excellent). - "Contribution": A rating from 1 to 4 (poor, fair, good, excellent). - "Overall": A rating from 1 to 10 (very strong reject to award quality). - "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute). - "Decision": A decision that has to be one of: Accept, Reject. """ neurips_form = """ ## Review Guidelines Evaluate the paper across these dimensions: 1. **Originality**: Are the ideas novel? Is related work cited? 2. **Quality**: Is the work technically sound? Are claims well supported? 3. **Clarity**: Is the paper well-written and organized? 4. **Significance**: Are the results important? Will others build on this work? 5. **Soundness**: Rate the technical quality (1-4: poor, fair, good, excellent) 6. **Presentation**: Rate the writing quality (1-4: poor, fair, good, excellent) 7. **Contribution**: Rate the overall contribution (1-4: poor, fair, good, excellent) 8. **Overall Score**: Rate 1-10 where: - 1-3: Reject - 4-6: Borderline - 7-8: Accept - 9-10: Strong Accept """ + template_instructions if reviewer_type is None: reviewer_type = "" sys_prompt = ( f"You are an AI researcher reviewing an academic paper. " f"Be critical and thorough in your assessment. {reviewer_type}\n" ) + neurips_form prompt = f"Review the following paper:\n\n{paper_content}\n\n" review_output = query_model( system_prompt=sys_prompt, prompt=prompt, client=client, model=model, ) if review_output is None: raise ValueError("LLM query returned None.") review_json = extract_json_between_markers(review_output) if review_json is None: raise ValueError("Could not extract JSON review from LLM output.") required_keys = [ "Overall", "Soundness", "Confidence", "Contribution", "Presentation", "Clarity", "Originality", "Quality", "Significance", ] for key in required_keys: if key not in review_json: raise KeyError(f"Missing key '{key}' in review JSON.") # Calculate weighted score overall = int(review_json["Overall"]) / 10.0 soundness = int(review_json["Soundness"]) / 4.0 confidence = int(review_json["Confidence"]) / 5.0 contribution = int(review_json["Contribution"]) / 4.0 presentation = int(review_json["Presentation"]) / 4.0 clarity = int(review_json["Clarity"]) / 4.0 originality = int(review_json["Originality"]) / 4.0 quality = int(review_json["Quality"]) / 4.0 significance = int(review_json["Significance"]) / 4.0 weights = { "clarity": 0.1, "quality": 0.1, "overall": 1.0, "soundness": 0.1, "confidence": 0.1, "originality": 0.1, "significance": 0.1, "contribution": 0.4, "presentation": 0.2, } max_score = sum(weights.values()) performance = ( weights["soundness"] * soundness + weights["presentation"] * presentation + weights["confidence"] * confidence + weights["contribution"] * contribution + weights["overall"] * overall + weights["originality"] * originality + weights["significance"] * significance + weights["clarity"] * clarity + weights["quality"] * quality ) / max_score * 10.0 return ( performance, f"Performance Score: {performance:.2f}/10\n\n{review_output}", True, ) except Exception as e: print(f"Error in get_score (attempt {attempt + 1}/{attempts}): {e}") last_exception_message = str(e) return ( None, f"Failed to get score after {attempts} attempts. Last error: {last_exception_message}", False, ) class ReviewerAgent: """Agent that simulates a single reviewer with specific persona.""" def __init__(self, client: OpenAI, model: str, persona: str, name: str): self.client = client self.model = model self.persona = persona self.name = name def review_paper(self, paper_content: str) -> Dict[str, Any]: """Generate review for the paper.""" score, review_text, success = get_score( paper_content=paper_content, reviewer_type=self.persona, client=self.client, model=self.model, ) return { "reviewer": self.name, "score": score, "review": review_text, "success": success } class MultiReviewerSystem: """System that coordinates multiple reviewer agents.""" def __init__(self, api_key: str, base_url: str, model: str): self.client = OpenAI(api_key=api_key, base_url=base_url) self.model = model self.reviewers = [ ReviewerAgent( client=self.client, model=self.model, persona="You focus on experimental rigor and expect well-designed experiments with clear insights.", name="Reviewer 1: Experimentalist" ), ReviewerAgent( client=self.client, model=self.model, persona="You look for impactful ideas that would advance the field significantly.", name="Reviewer 2: Impactist" ), ReviewerAgent( client=self.client, model=self.model, persona="You seek novel ideas that have not been proposed before and creative approaches.", name="Reviewer 3: Novelty Seeker" ) ] def review_paper_sequential(self, paper_content: str, progress_callback=None) -> Dict[str, Any]: """Generate reviews from multiple reviewers sequentially.""" reviews = [] total_score = 0 successful_reviews = 0 for i, reviewer in enumerate(self.reviewers): if progress_callback: progress_callback(i / len(self.reviewers), f"Reviewing with {reviewer.name}...") review_result = reviewer.review_paper(paper_content) reviews.append(review_result) if review_result["success"] and review_result["score"] is not None: total_score += review_result["score"] successful_reviews += 1 avg_score = total_score / successful_reviews if successful_reviews > 0 else 0 if progress_callback: progress_callback(1.0, "Review complete!") return { "reviews": reviews, "average_score": avg_score, "total_reviewers": len(self.reviewers), "successful_reviews": successful_reviews }