Spaces:
Sleeping
Sleeping
File size: 11,157 Bytes
9a4a0bb 9c12608 9a4a0bb 9c12608 9a4a0bb 9c12608 9a4a0bb 9c12608 9a4a0bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
"""Multi-agent system for literature review using OpenAI-compatible API."""
import json
import re
import os
import time
from typing import Any, Optional, Dict, Tuple
from openai import OpenAI
def _strip_control_chars(text: str) -> str:
    """Return *text* without ASCII control characters (code points < 32 and DEL)."""
    return "".join(char for char in text if ord(char) >= 32 and ord(char) != 127)


def _loads_with_control_char_retry(candidate: str) -> Any:
    """Parse *candidate* as JSON, retrying once with control characters removed.

    Raises:
        json.JSONDecodeError: If neither the stripped text nor its cleaned
            variant is valid JSON.
    """
    candidate = candidate.strip()
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # The strict parser rejects raw control characters (for example a
        # literal newline inside a string value); drop them and try again.
        return json.loads(_strip_control_chars(candidate))


def _first_embedded_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object that starts at some ``{`` in *text*."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete document beginning at `start`
            # and ignores trailing text, so nested objects and braces inside
            # string values are handled by the real JSON grammar.
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            return parsed
        start = text.find("{", start + 1)
    return None


def extract_json_between_markers(llm_output: str) -> Optional[Dict[str, Any]]:
    """Extract the first JSON object found in a string, typically an LLM output.

    Bodies of ```json fenced blocks are tried first, in order of appearance.
    If no fenced block yields a JSON object, the whole text is scanned for the
    first parseable object, which may contain nested objects. As a last
    resort the scan is repeated with control characters removed.

    Args:
        llm_output: Raw text that may contain a JSON object.

    Returns:
        The decoded object as a dict, or None when no JSON object is found.
    """
    for fenced_body in re.findall(r"```json(.*?)```", llm_output, re.DOTALL):
        try:
            parsed = _loads_with_control_char_retry(fenced_body)
        except json.JSONDecodeError:
            continue
        # Only objects satisfy the declared return type; a fenced list or
        # scalar is skipped so the scan below can still find an object.
        if isinstance(parsed, dict):
            return parsed

    found = _first_embedded_json_object(llm_output)
    if found is None:
        found = _first_embedded_json_object(_strip_control_chars(llm_output))
    return found
def query_model(
    system_prompt: str,
    prompt: str,
    client: OpenAI,
    model: str,
    *,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> Optional[str]:
    """Send one chat-completion request and return the reply text.

    A fixed one-second pause precedes every request as a crude rate limit.
    This function does not retry: on any exception it prints the error,
    pauses two more seconds (so a caller that retries is backed off), and
    returns None.

    Args:
        system_prompt: Content of the system message.
        prompt: Content of the user message.
        client: OpenAI-compatible client used to issue the request.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The content of the first returned choice, or None if the request
        failed.
    """
    # Rate limiting: 1 request per second to avoid concurrency issues
    time.sleep(1)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error querying model: {e}")
        # Back off before handing control to the caller, which may retry.
        time.sleep(2)
        return None
# Output-format instructions appended to every reviewer system prompt.
_REVIEW_RESPONSE_FORMAT = """
Respond in the following format:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
In <JSON>, provide the review in JSON format with the following fields:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Limitations": A set of limitations and potential negative societal impacts.
- "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
- "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
- "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Overall": A rating from 1 to 10 (very strong reject to award quality).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
- "Decision": A decision that has to be one of: Accept, Reject.
"""

# Review rubric shown to the model, followed by the output-format instructions.
_REVIEW_GUIDELINES = """
## Review Guidelines
Evaluate the paper across these dimensions:
1. **Originality**: Are the ideas novel? Is related work cited?
2. **Quality**: Is the work technically sound? Are claims well supported?
3. **Clarity**: Is the paper well-written and organized?
4. **Significance**: Are the results important? Will others build on this work?
5. **Soundness**: Rate the technical quality (1-4: poor, fair, good, excellent)
6. **Presentation**: Rate the writing quality (1-4: poor, fair, good, excellent)
7. **Contribution**: Rate the overall contribution (1-4: poor, fair, good, excellent)
8. **Overall Score**: Rate 1-10 where:
- 1-3: Reject
- 4-6: Borderline
- 7-8: Accept
- 9-10: Strong Accept
""" + _REVIEW_RESPONSE_FORMAT

# (review JSON key, top of its rating scale, weight in the aggregate score).
# Every scale starts at 1.
_RATING_SPECS = (
    ("Overall", 10.0, 1.0),
    ("Soundness", 4.0, 0.1),
    ("Confidence", 5.0, 0.1),
    ("Contribution", 4.0, 0.4),
    ("Presentation", 4.0, 0.2),
    ("Clarity", 4.0, 0.1),
    ("Originality", 4.0, 0.1),
    ("Quality", 4.0, 0.1),
    ("Significance", 4.0, 0.1),
)


def _weighted_performance(review_json: Dict[str, Any]) -> float:
    """Collapse the numeric ratings of a review into one score out of 10.

    Each rating is divided by the top of its scale, multiplied by its weight,
    and the weighted sum is normalised by the total weight.

    Raises:
        KeyError: If a required rating is absent.
        ValueError: If a rating cannot be read as a number.
        TypeError: If a rating has a type ``float()`` does not accept.
    """
    for key, _, _ in _RATING_SPECS:
        if key not in review_json:
            raise KeyError(f"Missing key '{key}' in review JSON.")

    weighted_sum = 0.0
    total_weight = 0.0
    for key, scale_max, weight in _RATING_SPECS:
        # float() accepts ints, floats and numeric strings such as "3.5".
        rating = float(review_json[key])
        if rating != rating:  # NaN is the only value unequal to itself
            raise ValueError(f"Rating '{key}' is not a number.")
        # Keep a rating outside its advertised scale from pushing the
        # aggregate outside the "/10" range reported to the user.
        rating = min(max(rating, 1.0), scale_max)
        weighted_sum += weight * rating / scale_max
        total_weight += weight
    return weighted_sum / total_weight * 10.0


def get_score(
    paper_content: str,
    reviewer_type: Optional[str] = None,
    attempts: int = 3,
    client: OpenAI = None,
    model: str = None,
) -> Tuple[Optional[float], str, bool]:
    """Evaluate a research paper using an LLM reviewer.

    The model is asked for a structured review; the JSON part of the reply is
    extracted and its numeric ratings are combined into a weighted score out
    of 10. Any failure (no reply, no JSON, missing or unreadable ratings) is
    printed and the whole query is repeated, up to ``attempts`` times.

    Args:
        paper_content: Text of the paper to review.
        reviewer_type: Optional persona sentence appended to the system
            prompt; None is treated as an empty string.
        attempts: Maximum number of query attempts.
        client: OpenAI-compatible client forwarded to ``query_model``.
        model: Model identifier forwarded to ``query_model``.

    Returns:
        ``(score, text, success)``. On success ``score`` is the weighted
        score and ``text`` is a score header followed by the raw model
        output. On failure ``score`` is None, ``text`` describes the last
        error and ``success`` is False.
    """
    if reviewer_type is None:
        reviewer_type = ""

    # Both prompts are identical on every attempt, so build them once.
    sys_prompt = (
        f"You are an AI researcher reviewing an academic paper. "
        f"Be critical and thorough in your assessment. {reviewer_type}\n"
    ) + _REVIEW_GUIDELINES
    prompt = f"Review the following paper:\n\n{paper_content}\n\n"

    last_exception_message = ""
    for attempt in range(attempts):
        try:
            review_output = query_model(
                system_prompt=sys_prompt,
                prompt=prompt,
                client=client,
                model=model,
            )
            if review_output is None:
                raise ValueError("LLM query returned None.")

            review_json = extract_json_between_markers(review_output)
            if review_json is None:
                raise ValueError("Could not extract JSON review from LLM output.")

            performance = _weighted_performance(review_json)
            return (
                performance,
                f"Performance Score: {performance:.2f}/10\n\n{review_output}",
                True,
            )
        except Exception as e:
            # Retry boundary: any failure in this attempt triggers the next one.
            print(f"Error in get_score (attempt {attempt + 1}/{attempts}): {e}")
            last_exception_message = str(e)

    return (
        None,
        f"Failed to get score after {attempts} attempts. Last error: {last_exception_message}",
        False,
    )
class ReviewerAgent:
    """A single simulated reviewer, characterised by a persona and a display name."""

    def __init__(self, client: OpenAI, model: str, persona: str, name: str):
        """Store the API client, model identifier, persona text and display name."""
        self.client, self.model = client, model
        self.persona, self.name = persona, name

    def review_paper(self, paper_content: str) -> Dict[str, Any]:
        """Review *paper_content* with this reviewer's persona.

        Returns:
            A dict with the keys "reviewer" (this agent's name), "score",
            "review" and "success"; the last three are the values returned
            by ``get_score``, in that order.
        """
        outcome = get_score(
            paper_content=paper_content,
            reviewer_type=self.persona,
            client=self.client,
            model=self.model,
        )
        report: Dict[str, Any] = {"reviewer": self.name}
        report.update(zip(("score", "review", "success"), outcome))
        return report
class MultiReviewerSystem:
    """Runs a fixed panel of reviewer agents over a paper and aggregates their scores."""

    def __init__(self, api_key: str, base_url: str, model: str):
        """Create the shared API client and the three-reviewer panel."""
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model
        # (persona, display name) for each panel member, in review order.
        panel = (
            (
                "You focus on experimental rigor and expect well-designed experiments with clear insights.",
                "Reviewer 1: Experimentalist",
            ),
            (
                "You look for impactful ideas that would advance the field significantly.",
                "Reviewer 2: Impactist",
            ),
            (
                "You seek novel ideas that have not been proposed before and creative approaches.",
                "Reviewer 3: Novelty Seeker",
            ),
        )
        self.reviewers = [
            ReviewerAgent(
                client=self.client,
                model=self.model,
                persona=persona,
                name=name,
            )
            for persona, name in panel
        ]

    def review_paper_sequential(self, paper_content: str, progress_callback=None) -> Dict[str, Any]:
        """Collect one review per reviewer, one after another.

        Args:
            paper_content: Text of the paper handed to every reviewer.
            progress_callback: Optional callable invoked as
                ``progress_callback(fraction, message)`` before each review
                and once more with ``1.0`` when all reviews are done.

        Returns:
            A dict with "reviews" (the per-reviewer results in order),
            "average_score" (mean score of the successful reviews, or 0 when
            there are none), "total_reviewers" and "successful_reviews".
        """
        panel_size = len(self.reviewers)
        collected = []
        for position, agent in enumerate(self.reviewers):
            if progress_callback:
                progress_callback(position / panel_size, f"Reviewing with {agent.name}...")
            collected.append(agent.review_paper(paper_content))

        # Only reviews that succeeded and actually carry a score count
        # towards the average.
        valid_scores = [
            entry["score"]
            for entry in collected
            if entry["success"] and entry["score"] is not None
        ]
        mean_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0

        if progress_callback:
            progress_callback(1.0, "Review complete!")

        return {
            "reviews": collected,
            "average_score": mean_score,
            "total_reviewers": len(self.reviewers),
            "successful_reviews": len(valid_scores),
        }
|