Spaces:
Sleeping
Sleeping
| """Multi-agent system for literature review using OpenAI-compatible API.""" | |
| import json | |
| import re | |
| import os | |
| import time | |
| from typing import Any, Optional, Dict, Tuple | |
| from openai import OpenAI | |
def extract_json_between_markers(llm_output: str) -> Optional[Dict[str, Any]]:
    """Extract the first parseable JSON object from an LLM response.

    Looks for a fenced ```json ... ``` block first; if none is present, falls
    back to scanning for brace-delimited spans in the raw text.

    Args:
        llm_output: Raw text returned by the model.

    Returns:
        The first candidate that parses as JSON, or None if nothing parses.
    """
    fenced = re.findall(r"```json(.*?)```", llm_output, re.DOTALL)
    if fenced:
        candidates = fenced
    else:
        # Fallback 1: greedy outermost-brace span, so nested objects like
        # {"a": {"b": 2}} are captured whole (the old shallow pattern would
        # truncate them to the innermost braces).
        # Fallback 2: shallow single-level objects, kept in case the greedy
        # span covers multiple unrelated objects and fails to parse.
        candidates = (
            re.findall(r"\{.*\}", llm_output, re.DOTALL)
            + re.findall(r"\{[^{}]*\}", llm_output, re.DOTALL)
        )
    for candidate in candidates:
        candidate = candidate.strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Control characters are invalid in JSON; strip them and retry.
            cleaned = "".join(
                ch for ch in candidate if ord(ch) >= 32 and ord(ch) != 127
            )
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError:
                continue
    return None
def query_model(
    system_prompt: str,
    prompt: str,
    client: "OpenAI",
    model: str,
    *,
    rate_limit_s: float = 1.0,
    error_backoff_s: float = 2.0,
) -> Optional[str]:
    """Send a system/user prompt pair to an OpenAI-compatible chat model.

    Args:
        system_prompt: Content of the "system" message.
        prompt: Content of the "user" message.
        client: OpenAI-compatible client used to issue the request.
        model: Model identifier passed to the API.
        rate_limit_s: Seconds to sleep before each request (crude rate
            limiting, 1 req/s by default to avoid concurrency issues).
        error_backoff_s: Seconds to sleep after a failure, so that callers
            retrying in a loop are throttled. (This function itself does
            not retry.)

    Returns:
        The model's reply text, or None if the request failed.
    """
    try:
        time.sleep(rate_limit_s)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            temperature=0.7,
            max_tokens=4000,
        )
        return response.choices[0].message.content
    except Exception as e:  # broad on purpose: any API failure maps to None
        print(f"Error querying model: {e}")
        # Back off before returning None so an external retry loop is throttled.
        time.sleep(error_backoff_s)
        return None
def _build_review_prompts(paper_content: str, reviewer_type: str) -> Tuple[str, str]:
    """Build the (system prompt, user prompt) pair for one review request.

    reviewer_type is inserted verbatim into the system prompt to give the
    reviewer a persona; pass "" for a neutral reviewer.
    """
    template_instructions = """
Respond in the following format:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
In <JSON>, provide the review in JSON format with the following fields:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Limitations": A set of limitations and potential negative societal impacts.
- "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
- "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
- "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Overall": A rating from 1 to 10 (very strong reject to award quality).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
- "Decision": A decision that has to be one of: Accept, Reject.
"""
    neurips_form = """
## Review Guidelines
Evaluate the paper across these dimensions:
1. **Originality**: Are the ideas novel? Is related work cited?
2. **Quality**: Is the work technically sound? Are claims well supported?
3. **Clarity**: Is the paper well-written and organized?
4. **Significance**: Are the results important? Will others build on this work?
5. **Soundness**: Rate the technical quality (1-4: poor, fair, good, excellent)
6. **Presentation**: Rate the writing quality (1-4: poor, fair, good, excellent)
7. **Contribution**: Rate the overall contribution (1-4: poor, fair, good, excellent)
8. **Overall Score**: Rate 1-10 where:
- 1-3: Reject
- 4-6: Borderline
- 7-8: Accept
- 9-10: Strong Accept
""" + template_instructions
    sys_prompt = (
        f"You are an AI researcher reviewing an academic paper. "
        f"Be critical and thorough in your assessment. {reviewer_type}\n"
    ) + neurips_form
    user_prompt = f"Review the following paper:\n\n{paper_content}\n\n"
    return sys_prompt, user_prompt


# (JSON key, rating scale maximum, weight) for the blended performance score.
_SCORE_CRITERIA = (
    ("Overall", 10.0, 1.0),
    ("Soundness", 4.0, 0.1),
    ("Confidence", 5.0, 0.1),
    ("Contribution", 4.0, 0.4),
    ("Presentation", 4.0, 0.2),
    ("Clarity", 4.0, 0.1),
    ("Originality", 4.0, 0.1),
    ("Quality", 4.0, 0.1),
    ("Significance", 4.0, 0.1),
)


def _compute_weighted_score(review_json: Dict[str, Any]) -> float:
    """Blend the individual review ratings into a single 0-10 score.

    Each rating is normalised to [0, 1] by its scale maximum, combined with
    the criterion weights, then rescaled to 0-10.

    Raises:
        KeyError: if a required rating is missing from review_json.
        ValueError: if a rating is not convertible to int.
    """
    for key, _, _ in _SCORE_CRITERIA:
        if key not in review_json:
            raise KeyError(f"Missing key '{key}' in review JSON.")
    total_weight = sum(weight for _, _, weight in _SCORE_CRITERIA)
    blended = sum(
        weight * int(review_json[key]) / scale
        for key, scale, weight in _SCORE_CRITERIA
    )
    return blended / total_weight * 10.0


def get_score(
    paper_content: str,
    reviewer_type: Optional[str] = None,
    attempts: int = 3,
    client: Optional["OpenAI"] = None,
    model: Optional[str] = None,
) -> Tuple[Optional[float], str, bool]:
    """Evaluate a research paper using an LLM reviewer.

    Queries the model for a structured review, extracts its JSON payload,
    and blends the ratings into one score. Retries on any failure.

    Args:
        paper_content: Full text of the paper to review.
        reviewer_type: Optional persona text appended to the system prompt.
        attempts: Maximum number of query attempts before giving up.
        client: OpenAI-compatible client forwarded to query_model.
        model: Model identifier forwarded to query_model.

    Returns:
        (score, message, success): score is the 0-10 weighted performance
        score (None on failure); message contains the score plus the raw
        review text, or the last error on failure; success indicates
        whether a review was obtained.
    """
    last_exception_message = ""
    for attempt in range(attempts):
        try:
            persona = "" if reviewer_type is None else reviewer_type
            sys_prompt, prompt = _build_review_prompts(paper_content, persona)
            review_output = query_model(
                system_prompt=sys_prompt,
                prompt=prompt,
                client=client,
                model=model,
            )
            if review_output is None:
                raise ValueError("LLM query returned None.")
            review_json = extract_json_between_markers(review_output)
            if review_json is None:
                raise ValueError("Could not extract JSON review from LLM output.")
            performance = _compute_weighted_score(review_json)
            return (
                performance,
                f"Performance Score: {performance:.2f}/10\n\n{review_output}",
                True,
            )
        except Exception as e:
            # Any failure (query, extraction, missing keys) counts as one
            # burned attempt; remember the message for the final report.
            print(f"Error in get_score (attempt {attempt + 1}/{attempts}): {e}")
            last_exception_message = str(e)
    return (
        None,
        f"Failed to get score after {attempts} attempts. Last error: {last_exception_message}",
        False,
    )
class ReviewerAgent:
    """A single LLM reviewer with a fixed persona.

    The client, model, persona text and display name are set once at
    construction and reused for every paper the agent reviews.
    """

    def __init__(self, client: OpenAI, model: str, persona: str, name: str):
        self.client = client
        self.model = model
        # Persona text that get_score inserts into the reviewer system prompt.
        self.persona = persona
        self.name = name

    def review_paper(self, paper_content: str) -> Dict[str, Any]:
        """Run one review of paper_content and package the outcome as a dict."""
        score, review_text, succeeded = get_score(
            paper_content=paper_content,
            reviewer_type=self.persona,
            client=self.client,
            model=self.model,
        )
        outcome = {
            "reviewer": self.name,
            "score": score,
            "review": review_text,
            "success": succeeded,
        }
        return outcome
class MultiReviewerSystem:
    """Coordinates a panel of persona-driven reviewer agents."""

    # (persona, display name) for each member of the review panel.
    _PANEL = (
        (
            "You focus on experimental rigor and expect well-designed experiments with clear insights.",
            "Reviewer 1: Experimentalist",
        ),
        (
            "You look for impactful ideas that would advance the field significantly.",
            "Reviewer 2: Impactist",
        ),
        (
            "You seek novel ideas that have not been proposed before and creative approaches.",
            "Reviewer 3: Novelty Seeker",
        ),
    )

    def __init__(self, api_key: str, base_url: str, model: str):
        """Create one shared API client and instantiate the reviewer panel."""
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model
        self.reviewers = [
            ReviewerAgent(client=self.client, model=self.model, persona=persona, name=name)
            for persona, name in self._PANEL
        ]

    def review_paper_sequential(self, paper_content: str, progress_callback=None) -> Dict[str, Any]:
        """Collect one review per agent, in order, and aggregate the scores.

        progress_callback, when given, is called with (fraction_done, message)
        before each review and once more at completion.
        """
        reviews = []
        scores = []
        panel_size = len(self.reviewers)
        for index, reviewer in enumerate(self.reviewers):
            if progress_callback:
                progress_callback(index / panel_size, f"Reviewing with {reviewer.name}...")
            outcome = reviewer.review_paper(paper_content)
            reviews.append(outcome)
            if outcome["success"] and outcome["score"] is not None:
                scores.append(outcome["score"])
        if progress_callback:
            progress_callback(1.0, "Review complete!")
        # Average over successful reviews only; 0 when none succeeded.
        average = sum(scores) / len(scores) if scores else 0
        return {
            "reviews": reviews,
            "average_score": average,
            "total_reviewers": panel_size,
            "successful_reviews": len(scores),
        }