# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Evaluation toolkits for PaperVizAgent
"""

import asyncio
import base64
import re

import json_repair
from google.genai import types

from prompts import diagram_eval_prompts, plot_eval_prompts
from utils.generation_utils import (
    call_gemini_with_retry_async,
    call_claude_with_retry_async,
    call_openai_with_retry_async,
)

# Prompt mapping: task_name -> eval_dim -> system_prompt
PROMPT_MAP = {
    "diagram": {
        "faithfulness": diagram_eval_prompts.DIAGRAM_REFERENCED_COMPARISON_FAITHFULNESS_SYSTEM_PROMPT,
        "conciseness": diagram_eval_prompts.DIAGRAM_REFERENCED_COMPARISON_CONCISENESS_SYSTEM_PROMPT,
        "readability": diagram_eval_prompts.DIAGRAM_REFERENCED_COMPARISON_READABILITY_SYSTEM_PROMPT,
        "aesthetics": diagram_eval_prompts.DIAGRAM_REFERENCED_COMPARISON_AESTHETICS_SYSTEM_PROMPT,
    },
    "plot": {
        "faithfulness": plot_eval_prompts.PLOT_REFERENCED_COMPARISON_FAITHFULNESS_SYSTEM_PROMPT,
        "conciseness": plot_eval_prompts.PLOT_REFERENCED_COMPARISON_CONCISENESS_SYSTEM_PROMPT,
        "readability": plot_eval_prompts.PLOT_REFERENCED_COMPARISON_READABILITY_SYSTEM_PROMPT,
        "aesthetics": plot_eval_prompts.PLOT_REFERENCED_COMPARISON_AESTHETICS_SYSTEM_PROMPT,
    },
}

# Task configuration: task_name -> field labels
TASK_CONFIG = {
    "diagram": {
        "visual_intent_label": "Diagram Caption",
        "raw_content_label": "Methodology Section",
        "human_label": "Human-Drawn Diagram (Human)",
        "model_label": "Model-Generated Diagram (Model)",
    },
    "plot": {
        "visual_intent_label": "Visual Intent of the Desired Plot",
        "raw_content_label": "Raw Data",
        "human_label": "Human-Drawn Plot (Human)",
        "model_label": "Model-Generated Plot (Model)",
    },
}


def _try_regex_extract_winner(text: str) -> str | None:
    """Try to extract winner field using regex as a fallback."""
    patterns = [
        r'"winner"\s*:\s*"([^"]+)"',  # Standard JSON: "winner": "value"
        r'\*\*winner\*\*\s*:\s*"([^"]+)"',  # Markdown bold: **winner**: "value" or **winner**:"value"
        r'\*\*winner\*\*\s*:\s*([A-Za-z][A-Za-z\s]+?)(?:,|\n|$)',  # Markdown bold without quotes: **winner**: value (capture until comma, newline, or end)
        r'"winner"\s*:\s*([A-Za-z][A-Za-z\s]+?)(?:,|\n|$)',  # Mixed format: "winner": value (no quotes on value, capture until comma, newline, or end)
        r'(?:\*\*|")winner(?:\*\*|")\s*:\s*(?:\*\*|")?([A-Za-z][A-Za-z\s]+?)(?:\*\*|"|,|\n|$)',  # Very flexible: any winner marker followed by colon and value
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            value = match.group(1).strip()
            value = value.rstrip('*"').strip()
            return value
    
    return None


def _extract_winner_with_fallback(clean_json: str, eval_dim: str, valid_winners: list[str]) -> str:
    """Try regex extraction and return winner or 'Error'."""
    extracted = _try_regex_extract_winner(clean_json)
    if extracted and extracted in valid_winners:
        print(f"⚠️  {eval_dim}: regex extracted '{extracted}'")
        return extracted
    print(f"⚠️  {eval_dim}: failed to extract valid winner")
    return "Error"


def _determine_tier_outcome(dim1_outcome: str, dim2_outcome: str) -> str:
    """Determine the outcome for a tier given two dimension outcomes."""
    o1, o2 = dim1_outcome.strip(), dim2_outcome.strip()
    
    # Both agree on a clear winner
    if o1 == o2:
        if o1 in ["Both are good", "Both are bad"]:
            return "Tie"
        return o1
    
    # One Model, one neutral (Both are good/bad)
    if (o1 == "Model" and o2 in ["Both are good", "Both are bad"]) or \
       (o2 == "Model" and o1 in ["Both are good", "Both are bad"]):
        return "Model"
    
    # One Human, one neutral (Both are good/bad)
    if (o1 == "Human" and o2 in ["Both are good", "Both are bad"]) or \
       (o2 == "Human" and o1 in ["Both are good", "Both are bad"]):
        return "Human"
    
    # All other cases (conflicting winners, etc.) -> Tie
    return "Tie"


async def _run_single_eval_ref(
    task_name: str,
    eval_dim: str,
    raw_content: str,
    visual_intent: str,
    gt_image_base64: str,
    model_image_base64: str,
    model_name: str
) -> tuple[str, dict]:
    """Run a single evaluation dimension for referenced comparison."""
    # Validate the task name first so an unknown task raises a clear error
    # instead of a KeyError from the prompt lookup below.
    if task_name not in TASK_CONFIG:
        raise ValueError(f"Invalid task name: {task_name}")

    # Get the appropriate prompt and task-specific field labels
    sys_prompt = PROMPT_MAP[task_name][eval_dim]
    config = TASK_CONFIG[task_name]
    
    # Construct input text based on eval dimension
    if eval_dim in ["readability", "aesthetics"]:
        input_text = f"{config['visual_intent_label']}: {visual_intent}\n{config['human_label']}: "
    else:
        input_text = f"{config['raw_content_label']}: {raw_content}\n{config['visual_intent_label']}: {visual_intent}\n{config['human_label']}: "

    # Construct content list
    content_list = [
        {"type": "text", "text": input_text},
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": gt_image_base64,
            },
        },
        {"type": "text", "text": f"\n{config['model_label']}: "},
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": model_image_base64,
            },
        },
    ]

    valid_winners = ["Human", "Model", "Both are good", "Both are bad"]
    
    try:
        if "gemini" in model_name:
            response_text_list = await call_gemini_with_retry_async(
                model_name=model_name,
                contents=content_list,
                config=types.GenerateContentConfig(
                    system_instruction=sys_prompt,
                    temperature=1,
                    candidate_count=1,
                    max_output_tokens=50000,
                ),
            )
        elif "gpt" in model_name or "o1" in model_name or "o3" in model_name:
            response_text_list = await call_openai_with_retry_async(
                model_name=model_name,
                contents=content_list,
                config={
                    "system_prompt": sys_prompt,
                    "temperature": 1,
                    "candidate_num": 1,
                    "max_completion_tokens": 10000,
                },
                max_attempts=5,
                retry_delay=30,
            )
        else:
            response_text_list = await call_claude_with_retry_async(
                model_name=model_name,
                contents=content_list,
                config={
                    "system_prompt": sys_prompt,
                    "temperature": 1,
                    "candidate_num": 1,
                    "max_output_tokens": 10000,
                },
                max_attempts=5,
                retry_delay=30,
            )
        clean_json = response_text_list[0].replace("```json", "").replace("```", "").strip()
        res_obj = json_repair.loads(clean_json)
        
        if not isinstance(res_obj, dict):
            res_obj = {
                "comparison_reasoning": clean_json,
                "winner": _extract_winner_with_fallback(clean_json, eval_dim, valid_winners)
            }
        elif "winner" not in res_obj:
            res_obj["winner"] = _extract_winner_with_fallback(clean_json, eval_dim, valid_winners)
            if "comparison_reasoning" not in res_obj:
                res_obj["comparison_reasoning"] = clean_json
        
        return eval_dim, res_obj
    except Exception as e:
        print(f"❌ {eval_dim}: Evaluation failed - {str(e)[:100]}")
        extracted = _try_regex_extract_winner(clean_json) if 'clean_json' in locals() else None
        winner = extracted if (extracted and extracted in valid_winners) else "Error"
        return eval_dim, {"comparison_reasoning": str(e), "winner": winner}


async def get_score_for_image_referenced(
    sample_data: dict, task_name: str = "diagram", model_name: str = "", work_dir=None
) -> dict:
    """Get referenced-comparison scores for a generated image against the ground truth.

    Args:
        sample_data: Sample data dictionary
        task_name: Task name ("diagram" or "plot")
        model_name: Model name used as the evaluation judge
        work_dir: Work directory path for resolving relative paths (pathlib.Path)
    """
    from pathlib import Path

    raw_content = sample_data["content"]
    visual_intent = sample_data["visual_intent"]
    
    if "path_to_gt_image" not in sample_data:
        print("⚠️  No ground truth image path found. Skipping evaluation.")
        for dim in ["faithfulness", "conciseness", "readability", "aesthetics", "overall"]:
             sample_data[f"{dim}_outcome"] = "N/A - No GT"
        return sample_data

    path_to_gt_image_rel = sample_data["path_to_gt_image"]
    
    # Resolve relative path using work_dir
    if work_dir:
        path_to_gt_image = work_dir / f"data/PaperBananaBench/{task_name}" / path_to_gt_image_rel
    else:
        # Fallback for backward compatibility (assume it's already absolute)
        path_to_gt_image = Path(path_to_gt_image_rel)

    with open(path_to_gt_image, "rb") as f:
        gt_image_base64 = base64.b64encode(f.read()).decode("utf-8")

    eval_image_field = sample_data["eval_image_field"]
    
    # Check if image was successfully generated
    if eval_image_field not in sample_data:
        print(f"⚠️  Image field '{eval_image_field}' not found. Model generation failed - counting as Human win.")
        # Model failed to generate image, Human wins by default
        for dim in ["faithfulness", "conciseness", "readability", "aesthetics", "overall"]:
            sample_data[f"{dim}_reasoning"] = "Model failed to generate image - Human wins by default"
            sample_data[f"{dim}_outcome"] = "Human"
        return sample_data
    
    model_image_base64 = sample_data[eval_image_field]

    # Run evaluations for all dimensions
    dims = ["faithfulness", "conciseness", "readability", "aesthetics"]
    tasks = [
        _run_single_eval_ref(
            task_name,
            dim,
            raw_content,
            visual_intent,
            gt_image_base64,
            model_image_base64,
            model_name
        ) for dim in dims
    ]

    results = await asyncio.gather(*tasks)
    for eval_dim, res_obj in results:
        sample_data[f"{eval_dim}_reasoning"] = res_obj.get("comparison_reasoning", "")
        sample_data[f"{eval_dim}_outcome"] = res_obj.get("winner", "Unknown")

    faithfulness = sample_data.get("faithfulness_outcome", "Unknown")
    readability = sample_data.get("readability_outcome", "Unknown")
    conciseness = sample_data.get("conciseness_outcome", "Unknown")
    aesthetics = sample_data.get("aesthetics_outcome", "Unknown")
    
    # Tier 1: Faithfulness + Readability
    tier1_outcome = _determine_tier_outcome(faithfulness, readability)
    
    if tier1_outcome in ["Model", "Human"]:
        overall_outcome = tier1_outcome
        decision_path = f"Tier1({faithfulness}, {readability}) -> {tier1_outcome} [Decided at Tier 1]"
    else:
        # Tier 1 is tied, check Tier 2
        tier2_outcome = _determine_tier_outcome(conciseness, aesthetics)
        overall_outcome = tier2_outcome
        decision_path = f"Tier1({faithfulness}, {readability}) -> Tie; Tier2({conciseness}, {aesthetics}) -> {tier2_outcome} [Decided at Tier 2]"
    
    sample_data["overall_outcome"] = overall_outcome
    sample_data["overall_reasoning"] = f"Rule-based calculation: {decision_path}"

    return sample_data
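

# Minimal usage sketch (illustrative only; the field values and judge model
# name are placeholders, not taken from the benchmark data):
#
#   import asyncio
#   from pathlib import Path
#
#   sample = {
#       "content": "...methodology section...",
#       "visual_intent": "...figure caption...",
#       "path_to_gt_image": "example/gt.jpg",
#       "eval_image_field": "model_image_base64",
#       "model_image_base64": "<base64-encoded JPEG>",
#   }
#   scored = asyncio.run(
#       get_score_for_image_referenced(
#           sample, task_name="diagram",
#           model_name="gemini-...", work_dir=Path("."),
#       )
#   )
#   print(scored["overall_outcome"])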