# PaperReview / agents.py
# Author: Nur Arifin Akbar
# Commit 9c12608: Add rate limiting and API key management
"""Multi-agent system for literature review using OpenAI-compatible API."""
import json
import re
import os
import time
from typing import Any, Optional, Dict, Tuple
from openai import OpenAI
def extract_json_between_markers(llm_output: str) -> Optional[Dict[str, Any]]:
    """Extract the first parseable JSON object from an LLM output string.

    Strategy:
      1. Prefer content inside ```json ... ``` fenced blocks.
      2. Fall back to scanning the raw text for a balanced JSON object
         starting at any '{' (handles nested objects, unlike a flat
         regex such as r"\\{[^{}]*\\}").

    Args:
        llm_output: Raw text returned by the model.

    Returns:
        The first successfully parsed JSON object, or None if no valid
        JSON could be extracted.
    """
    fenced_pattern = r"```json(.*?)```"
    for candidate in re.findall(fenced_pattern, llm_output, re.DOTALL):
        candidate = candidate.strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Retry after removing control characters that sometimes leak
            # into model output and break the parser.
            cleaned = "".join(
                char for char in candidate if ord(char) >= 32 and ord(char) != 127
            )
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError:
                continue
    # Fallback: scan for a balanced JSON object anywhere in the text.
    # raw_decode parses a complete value starting at the given index, so
    # nested objects are handled correctly.
    decoder = json.JSONDecoder()
    for index, char in enumerate(llm_output):
        if char != "{":
            continue
        try:
            obj, _ = decoder.raw_decode(llm_output, index)
        except json.JSONDecodeError:
            continue
        if isinstance(obj, dict):
            return obj
    return None
def query_model(
    system_prompt: str,
    prompt: str,
    client: OpenAI,
    model: str,
    max_retries: int = 3,
) -> Optional[str]:
    """Query the model via an OpenAI-compatible API with rate limiting.

    Sleeps one second before every request as a crude rate limit, and —
    unlike the previous version, which claimed to retry but returned
    immediately — actually retries failed requests up to ``max_retries``
    times with a short backoff between attempts.

    Args:
        system_prompt: System-role message content.
        prompt: User-role message content.
        client: OpenAI-compatible client instance.
        model: Model identifier to query.
        max_retries: Maximum number of request attempts (default 3).

    Returns:
        The content of the first completion choice, or None if all
        attempts failed.
    """
    for attempt in range(max_retries):
        # Rate limiting: 1 request per second to avoid concurrency issues
        time.sleep(1)
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.7,
                max_tokens=4000,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error querying model (attempt {attempt + 1}/{max_retries}): {e}")
            # Back off before the next attempt; skip the wait after the last one.
            if attempt < max_retries - 1:
                time.sleep(2)
    return None
def _compute_weighted_score(review_json: Dict[str, Any]) -> float:
    """Combine the individual review ratings into a single 0-10 score.

    Each rating is normalized to [0, 1] by the maximum of its own scale,
    combined via a weighted average (the "Overall" rating dominates),
    and mapped back onto a 0-10 range.

    Raises:
        KeyError / ValueError: if a rating is missing or non-numeric.
    """
    normalized = {
        "overall": int(review_json["Overall"]) / 10.0,
        "soundness": int(review_json["Soundness"]) / 4.0,
        "confidence": int(review_json["Confidence"]) / 5.0,
        "contribution": int(review_json["Contribution"]) / 4.0,
        "presentation": int(review_json["Presentation"]) / 4.0,
        "clarity": int(review_json["Clarity"]) / 4.0,
        "originality": int(review_json["Originality"]) / 4.0,
        "quality": int(review_json["Quality"]) / 4.0,
        "significance": int(review_json["Significance"]) / 4.0,
    }
    weights = {
        "clarity": 0.1,
        "quality": 0.1,
        "overall": 1.0,
        "soundness": 0.1,
        "confidence": 0.1,
        "originality": 0.1,
        "significance": 0.1,
        "contribution": 0.4,
        "presentation": 0.2,
    }
    max_score = sum(weights.values())
    weighted = sum(weights[key] * normalized[key] for key in weights)
    return weighted / max_score * 10.0


def get_score(
    paper_content: str,
    reviewer_type: Optional[str] = None,
    attempts: int = 3,
    client: Optional[OpenAI] = None,
    model: Optional[str] = None,
) -> Tuple[Optional[float], str, bool]:
    """Evaluates a research paper using an LLM reviewer.

    Builds the review prompt once (previously it was rebuilt on every
    retry attempt), then queries the model up to ``attempts`` times,
    parsing and scoring the returned JSON review.

    Args:
        paper_content: Full text of the paper to review.
        reviewer_type: Optional persona text appended to the system prompt.
        attempts: Number of query/parse attempts before giving up.
        client: OpenAI-compatible client passed through to query_model.
        model: Model identifier passed through to query_model.

    Returns:
        (score, message, success): score is a 0-10 float or None on
        failure; message is the formatted review or an error summary;
        success indicates whether a usable review was obtained.
    """
    # Prompt templates are constant across attempts — build them once.
    template_instructions = """
Respond in the following format:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
In <JSON>, provide the review in JSON format with the following fields:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Limitations": A set of limitations and potential negative societal impacts.
- "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
- "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
- "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Overall": A rating from 1 to 10 (very strong reject to award quality).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
- "Decision": A decision that has to be one of: Accept, Reject.
"""
    neurips_form = """
## Review Guidelines
Evaluate the paper across these dimensions:
1. **Originality**: Are the ideas novel? Is related work cited?
2. **Quality**: Is the work technically sound? Are claims well supported?
3. **Clarity**: Is the paper well-written and organized?
4. **Significance**: Are the results important? Will others build on this work?
5. **Soundness**: Rate the technical quality (1-4: poor, fair, good, excellent)
6. **Presentation**: Rate the writing quality (1-4: poor, fair, good, excellent)
7. **Contribution**: Rate the overall contribution (1-4: poor, fair, good, excellent)
8. **Overall Score**: Rate 1-10 where:
- 1-3: Reject
- 4-6: Borderline
- 7-8: Accept
- 9-10: Strong Accept
""" + template_instructions
    if reviewer_type is None:
        reviewer_type = ""
    sys_prompt = (
        f"You are an AI researcher reviewing an academic paper. "
        f"Be critical and thorough in your assessment. {reviewer_type}\n"
    ) + neurips_form
    prompt = f"Review the following paper:\n\n{paper_content}\n\n"
    last_exception_message = ""
    for attempt in range(attempts):
        try:
            review_output = query_model(
                system_prompt=sys_prompt,
                prompt=prompt,
                client=client,
                model=model,
            )
            if review_output is None:
                raise ValueError("LLM query returned None.")
            review_json = extract_json_between_markers(review_output)
            if review_json is None:
                raise ValueError("Could not extract JSON review from LLM output.")
            # Validate all rating keys are present before scoring.
            required_keys = [
                "Overall", "Soundness", "Confidence", "Contribution",
                "Presentation", "Clarity", "Originality", "Quality", "Significance",
            ]
            for key in required_keys:
                if key not in review_json:
                    raise KeyError(f"Missing key '{key}' in review JSON.")
            performance = _compute_weighted_score(review_json)
            return (
                performance,
                f"Performance Score: {performance:.2f}/10\n\n{review_output}",
                True,
            )
        except Exception as e:
            print(f"Error in get_score (attempt {attempt + 1}/{attempts}): {e}")
            last_exception_message = str(e)
    return (
        None,
        f"Failed to get score after {attempts} attempts. Last error: {last_exception_message}",
        False,
    )
class ReviewerAgent:
    """A single reviewer persona backed by an LLM client.

    Each agent holds a connection (client + model) and an identity
    (persona text + display name) and produces one review per call.
    """

    def __init__(self, client: OpenAI, model: str, persona: str, name: str):
        # Connection settings for the underlying LLM.
        self.client, self.model = client, model
        # Identity: persona steers the review style, name labels the output.
        self.persona, self.name = persona, name

    def review_paper(self, paper_content: str) -> Dict[str, Any]:
        """Run one review pass over the paper and package the outcome."""
        outcome = get_score(
            paper_content=paper_content,
            reviewer_type=self.persona,
            client=self.client,
            model=self.model,
        )
        score, review_text, success = outcome
        result: Dict[str, Any] = {
            "reviewer": self.name,
            "score": score,
            "review": review_text,
            "success": success,
        }
        return result
class MultiReviewerSystem:
    """Coordinates a fixed panel of reviewer agents over one shared client."""

    def __init__(self, api_key: str, base_url: str, model: str):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model
        # (name, persona) pairs defining the review panel.
        panel = [
            ("Reviewer 1: Experimentalist",
             "You focus on experimental rigor and expect well-designed experiments with clear insights."),
            ("Reviewer 2: Impactist",
             "You look for impactful ideas that would advance the field significantly."),
            ("Reviewer 3: Novelty Seeker",
             "You seek novel ideas that have not been proposed before and creative approaches."),
        ]
        self.reviewers = [
            ReviewerAgent(client=self.client, model=self.model, persona=persona, name=name)
            for name, persona in panel
        ]

    def review_paper_sequential(self, paper_content: str, progress_callback=None) -> Dict[str, Any]:
        """Collect one review per agent, in order, and aggregate the scores.

        progress_callback, if given, is invoked as callback(fraction, message)
        before each reviewer and once on completion.
        """
        reviews = []
        scores = []
        panel_size = len(self.reviewers)
        for index, agent in enumerate(self.reviewers):
            if progress_callback:
                progress_callback(index / panel_size, f"Reviewing with {agent.name}...")
            outcome = agent.review_paper(paper_content)
            reviews.append(outcome)
            # Only count reviews that succeeded AND produced a numeric score.
            if outcome["success"] and outcome["score"] is not None:
                scores.append(outcome["score"])
        if progress_callback:
            progress_callback(1.0, "Review complete!")
        return {
            "reviews": reviews,
            "average_score": sum(scores) / len(scores) if scores else 0,
            "total_reviewers": panel_size,
            "successful_reviews": len(scores),
        }