"""Multi-agent system for literature review using OpenAI-compatible API."""

import json
import re
import os
import time
from typing import Any, Optional, Dict, Tuple
from openai import OpenAI


def extract_json_between_markers(llm_output: str) -> Optional[Dict[str, Any]]:
    """Extracts JSON content from a string, typically an LLM output."""
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)

    if not matches:
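        # Fallback: look for a bare {...} object. Note this pattern cannot
        # match nested braces, so it only recovers flat JSON objects.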
        json_pattern_fallback = r"\{[^{}]*\}"
        matches = re.findall(json_pattern_fallback, llm_output, re.DOTALL)

    for json_string in matches:
        json_string = json_string.strip()
        try:
            parsed_json = json.loads(json_string)
            return parsed_json
        except json.JSONDecodeError:
            try:
                json_string_clean = "".join(
                    char for char in json_string if ord(char) >= 32 and ord(char) != 127
                )
                parsed_json = json.loads(json_string_clean)
                return parsed_json
            except json.JSONDecodeError:
                continue

    return None
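
# Example (illustrative): extract_json_between_markers('```json\n{"Overall": 8}\n```')
# returns {"Overall": 8}; unparseable candidates are skipped and None is returned.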


def query_model(system_prompt: str, prompt: str, client: OpenAI, model: str) -> Optional[str]:
    """Query the model with the given prompts using OpenAI-compatible API with rate limiting."""
    try:
        # Crude rate limiting: sleep one second per request to stay within provider limits
        time.sleep(1)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=4000
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error querying model: {e}")
        # Pause briefly so the caller's retry does not immediately hit the API again
        time.sleep(2)
        return None


def get_score(
    paper_content: str,
    reviewer_type: Optional[str] = None,
    attempts: int = 3,
    client: Optional[OpenAI] = None,
    model: Optional[str] = None,
) -> Tuple[Optional[float], str, bool]:
    """Evaluates a research paper using an LLM reviewer."""

    last_exception_message = ""
    for attempt in range(attempts):
        try:
            template_instructions = """
            Respond in the following format:

            THOUGHT:
            <THOUGHT>

            REVIEW JSON:
            ```json
            <JSON>
            ```

            In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
            Detail your high-level arguments, necessary choices and desired outcomes of the review.

            In <JSON>, provide the review in JSON format with the following fields:
            - "Summary": A summary of the paper content and its contributions.
            - "Strengths": A list of strengths of the paper.
            - "Weaknesses": A list of weaknesses of the paper.
            - "Originality": A rating from 1 to 4 (low, medium, high, very high).
            - "Quality": A rating from 1 to 4 (low, medium, high, very high).
            - "Clarity": A rating from 1 to 4 (low, medium, high, very high).
            - "Significance": A rating from 1 to 4 (low, medium, high, very high).
            - "Questions": A set of clarifying questions to be answered by the paper authors.
            - "Limitations": A set of limitations and potential negative societal impacts.
            - "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
            - "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
            - "Overall": A rating from 1 to 10 (very strong reject to award quality).
            - "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
            - "Decision": A decision that has to be one of: Accept, Reject.
            """

            neurips_form = """
            ## Review Guidelines

            Evaluate the paper across these dimensions:

            1. **Originality**: Are the ideas novel? Is related work cited?
            2. **Quality**: Is the work technically sound? Are claims well supported?
            3. **Clarity**: Is the paper well-written and organized?
            4. **Significance**: Are the results important? Will others build on this work?
            5. **Soundness**: Rate the technical quality (1-4: poor, fair, good, excellent)
            6. **Presentation**: Rate the writing quality (1-4: poor, fair, good, excellent)
            7. **Contribution**: Rate the overall contribution (1-4: poor, fair, good, excellent)
            8. **Overall Score**: Rate 1-10 where:
               - 1-3: Reject
               - 4-6: Borderline
               - 7-8: Accept
               - 9-10: Strong Accept

            """ + template_instructions

            if reviewer_type is None:
                reviewer_type = ""

            sys_prompt = (
                f"You are an AI researcher reviewing an academic paper. "
                f"Be critical and thorough in your assessment. {reviewer_type}\n"
            ) + neurips_form

            prompt = f"Review the following paper:\n\n{paper_content}\n\n"

            review_output = query_model(
                system_prompt=sys_prompt,
                prompt=prompt,
                client=client,
                model=model,
            )

            if review_output is None:
                raise ValueError("LLM query returned None.")

            review_json = extract_json_between_markers(review_output)

            if review_json is None:
                raise ValueError("Could not extract JSON review from LLM output.")

            required_keys = [
                "Overall", "Soundness", "Confidence", "Contribution",
                "Presentation", "Clarity", "Originality", "Quality", "Significance",
            ]

            for key in required_keys:
                if key not in review_json:
                    raise KeyError(f"Missing key '{key}' in review JSON.")

            # Normalize each rating to [0, 1] by its scale maximum before weighting
            overall = int(review_json["Overall"]) / 10.0
            soundness = int(review_json["Soundness"]) / 4.0
            confidence = int(review_json["Confidence"]) / 5.0
            contribution = int(review_json["Contribution"]) / 4.0
            presentation = int(review_json["Presentation"]) / 4.0
            clarity = int(review_json["Clarity"]) / 4.0
            originality = int(review_json["Originality"]) / 4.0
            quality = int(review_json["Quality"]) / 4.0
            significance = int(review_json["Significance"]) / 4.0

            weights = {
                "clarity": 0.1,
                "quality": 0.1,
                "overall": 1.0,
                "soundness": 0.1,
                "confidence": 0.1,
                "originality": 0.1,
                "significance": 0.1,
                "contribution": 0.4,
                "presentation": 0.2,
            }
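            # "Overall" (weight 1.0) and "Contribution" (0.4) dominate the blend;
            # the remaining criteria contribute small adjustments.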

            max_score = sum(weights.values())

            performance = (
                weights["soundness"] * soundness +
                weights["presentation"] * presentation +
                weights["confidence"] * confidence +
                weights["contribution"] * contribution +
                weights["overall"] * overall +
                weights["originality"] * originality +
                weights["significance"] * significance +
                weights["clarity"] * clarity +
                weights["quality"] * quality
            ) / max_score * 10.0
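            # Sanity check: if every normalized rating equals 0.5, the weighted
            # sum is 0.5 * max_score, so performance is exactly 5.0 out of 10.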

            return (
                performance,
                f"Performance Score: {performance:.2f}/10\n\n{review_output}",
                True,
            )

        except Exception as e:
            print(f"Error in get_score (attempt {attempt + 1}/{attempts}): {e}")
            last_exception_message = str(e)

    return (
        None,
        f"Failed to get score after {attempts} attempts. Last error: {last_exception_message}",
        False,
    )


class ReviewerAgent:
    """Agent that simulates a single reviewer with specific persona."""

    def __init__(self, client: OpenAI, model: str, persona: str, name: str):
        self.client = client
        self.model = model
        self.persona = persona
        self.name = name

    def review_paper(self, paper_content: str) -> Dict[str, Any]:
        """Generate review for the paper."""
        score, review_text, success = get_score(
            paper_content=paper_content,
            reviewer_type=self.persona,
            client=self.client,
            model=self.model,
        )

        return {
            "reviewer": self.name,
            "score": score,
            "review": review_text,
            "success": success
        }


class MultiReviewerSystem:
    """System that coordinates multiple reviewer agents."""

    def __init__(self, api_key: str, base_url: str, model: str):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

        self.reviewers = [
            ReviewerAgent(
                client=self.client,
                model=self.model,
                persona="You focus on experimental rigor and expect well-designed experiments with clear insights.",
                name="Reviewer 1: Experimentalist"
            ),
            ReviewerAgent(
                client=self.client,
                model=self.model,
                persona="You look for impactful ideas that would advance the field significantly.",
                name="Reviewer 2: Impactist"
            ),
            ReviewerAgent(
                client=self.client,
                model=self.model,
                persona="You seek novel ideas that have not been proposed before and creative approaches.",
                name="Reviewer 3: Novelty Seeker"
            )
        ]

    def review_paper_sequential(self, paper_content: str, progress_callback=None) -> Dict[str, Any]:
        """Generate reviews from multiple reviewers sequentially."""
        reviews = []
        total_score = 0
        successful_reviews = 0

        for i, reviewer in enumerate(self.reviewers):
            if progress_callback:
                progress_callback(i / len(self.reviewers), f"Reviewing with {reviewer.name}...")

            review_result = reviewer.review_paper(paper_content)
            reviews.append(review_result)

            if review_result["success"] and review_result["score"] is not None:
                total_score += review_result["score"]
                successful_reviews += 1

        avg_score = total_score / successful_reviews if successful_reviews > 0 else 0.0

        if progress_callback:
            progress_callback(1.0, "Review complete!")

        return {
            "reviews": reviews,
            "average_score": avg_score,
            "total_reviewers": len(self.reviewers),
            "successful_reviews": successful_reviews
        }
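

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline. The base URL,
    # model name, and sample paper text below are placeholder assumptions;
    # point them at any OpenAI-compatible deployment.
    system = MultiReviewerSystem(
        api_key=os.environ.get("OPENAI_API_KEY", ""),
        base_url="https://api.openai.com/v1",  # assumed endpoint
        model="gpt-4o-mini",  # assumed model name
    )
    result = system.review_paper_sequential(
        "Title: An Example Paper\n\nAbstract: ...",
        progress_callback=lambda frac, msg: print(f"[{frac:.0%}] {msg}"),
    )
    print(
        f"Average score: {result['average_score']:.2f}/10 "
        f"({result['successful_reviews']}/{result['total_reviewers']} reviewers succeeded)"
    )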