# Hugging Face Space: X-iZhang/RadEval (status: Running)
# File size: 10,870 bytes

import gradio as gr
import sys
import os
import torch

def setup_cpu_environment():
    """Configure the process for CPU-only execution.

    Sets ``CUDA_VISIBLE_DEVICES`` to an empty string, fixes PyTorch's
    thread count at 4, switches ``TOKENIZERS_PARALLELISM`` off and points
    ``TRANSFORMERS_CACHE`` at ``./cache``.
    """
    env = os.environ

    # An empty device list leaves no CUDA device visible to this process.
    env['CUDA_VISIBLE_DEVICES'] = ''

    torch.set_num_threads(4)

    env.update({
        'TOKENIZERS_PARALLELISM': 'false',
        'TRANSFORMERS_CACHE': './cache',
    })

# Apply the CPU-only settings at import time, ahead of the RadEval import below.
setup_cpu_environment()

from RadEval import RadEval

# UI metric label -> RadEval constructor keyword that switches the metric on.
# NOTE(review): 'do_radeval_bertsore' is kept exactly as the original call
# spelled it; confirm against RadEval's signature before renaming it.
_METRIC_FLAGS = {
    'RadGraph F1': 'do_radgraph',
    'BLEU': 'do_bleu',
    'ROUGE': 'do_rouge',
    'BERTScore': 'do_bertscore',
    'CheXbert F1': 'do_chexbert',
    'RaTEScore': 'do_ratescore',
    'RadCliQ': 'do_radcliq',
    'Temporal F1': 'do_temporal',
    'RadEval BERTScore': 'do_radeval_bertsore',
    'GREEN': 'do_green',
    'SRR-BERT': 'do_srr_bert',
}


def _preview(text, limit=100):
    """Return *text* cut to *limit* characters, with '...' appended when cut."""
    return f"{text[:limit]}{'...' if len(text) > limit else ''}"


def _format_score(value):
    """Return the display string for a real number, or None for anything else.

    Integral values are shown via ``str``; every other real number is shown
    with four decimals. Checking against the ``numbers`` ABCs (instead of the
    builtin ``int``/``float`` only) also accepts scalar types that register
    with those ABCs.
    """
    import numbers

    if not isinstance(value, numbers.Real):
        return None
    if isinstance(value, numbers.Integral):
        return str(value)
    return f"{float(value):.4f}"


def _flatten_scores(results):
    """Turn the evaluator's result mapping into ``[name, score_text]`` rows.

    Top-level numeric values become one row each. A dict value is expanded
    one level deep into ``"<metric>_<sub_metric>"`` rows. Non-numeric leaves
    are skipped.
    """
    rows = []
    for metric, score in results.items():
        if isinstance(score, dict):
            candidates = [(f"{metric}_{sub}", sub_score) for sub, sub_score in score.items()]
        else:
            candidates = [(metric, score)]
        for name, value in candidates:
            text = _format_score(value)
            if text is not None:
                rows.append([name, text])
    return rows


def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with selected metrics on a pair of reference and hypothesis texts

    Args:
        ref_text: Reference report text.
        hyp_text: Hypothesis report text.
        selected_metrics: Iterable of metric labels (keys of ``_METRIC_FLAGS``);
            ``None`` is treated as an empty selection.

    Returns:
        A ``(markdown_text, table_rows)`` pair. ``table_rows`` is a list of
        two-item lists: ``[metric_name, formatted_score]`` on success,
        ``["Error", message]`` on failure, or ``["No results", ""]`` when
        nothing was computed. Errors are reported through the return value
        rather than raised.
    """
    ref_clean = (ref_text or "").strip()
    hyp_clean = (hyp_text or "").strip()
    if not ref_clean or not hyp_clean:
        # Fail fast with a readable message instead of handing blank text
        # to the evaluator.
        error_msg = "Please provide both a reference and a hypothesis report."
        return error_msg, [["Error", error_msg]]

    chosen = selected_metrics or []
    config = {flag: label in chosen for label, flag in _METRIC_FLAGS.items()}
    if not any(config.values()):
        # Nothing enabled: skip constructing and running the evaluator.
        return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

    try:
        # Initialize RadEval with selected metrics and evaluate the single pair
        evaluator = RadEval(**config)
        results = evaluator(refs=[ref_clean], hyps=[hyp_clean])
        table_data = _flatten_scores(results)
    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        # Deliberately broad: this is the UI boundary, so any failure is
        # returned as display text.
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]

    if not table_data:
        return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

    parts = [
        "## RadEval Results\n\n",
        f"**Reference:** {_preview(ref_text)}\n\n",
        f"**Hypothesis:** {_preview(hyp_text)}\n\n",
        "### Evaluation Scores:\n\n",
    ]
    parts.extend(f"- **{name}**: {score_text}\n" for name, score_text in table_data)
    return "".join(parts), table_data


# Example pairs for radiology reports, one (title, reference, hypothesis) row each.
_EXAMPLE_ROWS = [
    (
        "Normal vs Normal",
        "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    ),
    (
        "Pneumonia Case",
        "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    ),
    (
        "Temporal Comparison",
        "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    ),
    (
        "Discordant Reports",
        "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    ),
    (
        "Ambiguous Language",
        "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "Possible small effusion on the left. Atelectasis cannot be excluded.",
    ),
    (
        "Surgical Follow-up",
        "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "Post-operative changes from CABG are present. No signs of surgical complication.",
    ),
    (
        "False Positive",
        "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "Right lower lobe consolidation concerning for pneumonia.",
    ),
    (
        "Textual Hallucination",
        "Heart and mediastinum are normal. Lungs are clear.",
        "Large left pleural effusion with mediastinal shift to the right.",
    ),
    (
        "Negation Challenge",
        "No evidence of pneumothorax or pleural effusion.",
        "Evidence of small pneumothorax on the right.",
    ),
    (
        "Fine-grained Difference",
        "Mild interstitial markings at the lung bases, likely chronic.",
        "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    ),
]

# Title -> {"ref": ..., "hyp": ...}; keys keep the row order above.
examples = {
    title: {"ref": reference, "hyp": hypothesis}
    for title, reference, hypothesis in _EXAMPLE_ROWS
}

def update_fields(choice):
    """Return updates for the reference and hypothesis textboxes.

    For any choice other than "Custom", both boxes are filled from
    ``examples[choice]`` and made read-only. For "Custom", both boxes are
    cleared and made editable. The result is a ``(ref_update, hyp_update)``
    pair.
    """
    if choice != "Custom":
        pair = examples[choice]
        ref_update = gr.update(value=pair["ref"], interactive=False)
        hyp_update = gr.update(value=pair["hyp"], interactive=False)
        return ref_update, hyp_update

    blank_ref = gr.update(value="", interactive=True)
    blank_hyp = gr.update(value="", interactive=True)
    return blank_ref, blank_hyp


# Metric labels grouped into the speed tiers listed in the app's
# "Performance Notes" section.
_FAST_METRICS = ("BLEU", "ROUGE", "BERTScore", "Temporal F1")
_MEDIUM_METRICS = ("RadEval BERTScore", "RaTEScore", "RadCliQ", "SRR-BERT")
_SLOW_METRICS = ("CheXbert F1", "RadGraph F1", "GREEN")

# Available metrics (ordered by computational complexity)
available_metrics = [*_FAST_METRICS, *_MEDIUM_METRICS, *_SLOW_METRICS]

# Fast metrics for default selection
default_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
]


# Build the Gradio UI. Components are declared top to bottom: header, example
# picker, the two report inputs, metric checkboxes, run button, result panes,
# and a collapsed metric reference.
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    # Page header with project links and the CPU performance warning.
    gr.Markdown(
        """
        # 🩺 RadEval:  A framework for radiology text evaluation
        [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        The demo is currently running on **CPU**. When using some slower metrics (like RadGraph, CheXbert, GREEN), it may take a while to complete evaluation. Please be patient.
        """
    )

    # Example picker: "Custom" (the default) plus one entry per key of `examples`.
    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    # Reference and hypothesis inputs in two equal-width columns.
    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    # Whenever the example selection changes, update_fields rewrites both textboxes.
    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    # Metric checkboxes, pre-checked with default_metrics.
    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    # Result panes: Markdown summary (scale 2) beside a Metric/Score table (scale 1).
    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section: static metric descriptions, collapsed by default.
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:
            
            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings
            
            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: Specialized BERT for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation
            
            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
            """
        )

    # On click, run_radeval_simple receives the two texts and the checked
    # metrics; its two return values fill the Markdown pane and the table.
    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )

# Launch the Gradio app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()