File size: 10,870 Bytes
a388fcc
 
 
5322748
a388fcc
adec6c3
 
 
5322748
adec6c3
 
 
 
 
 
 
c5b8c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adec6c3
c5b8c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import gradio as gr
import sys
import os
import torch

def setup_cpu_environment():
    """Configure this process for CPU execution.

    Caps torch at 4 threads and exports three environment variables:
    ``CUDA_VISIBLE_DEVICES`` (empty string), ``TOKENIZERS_PARALLELISM``
    (``'false'``) and ``TRANSFORMERS_CACHE`` (``'./cache'``).
    """
    torch.set_num_threads(4)

    cpu_only_env = {
        'CUDA_VISIBLE_DEVICES': '',
        'TOKENIZERS_PARALLELISM': 'false',
        'TRANSFORMERS_CACHE': './cache',
    }
    for variable, setting in cpu_only_env.items():
        os.environ[variable] = setting


# Called at import time, ahead of the RadEval import further down the file.
setup_cpu_environment()

from RadEval import RadEval

# Checkbox label -> RadEval constructor keyword that switches the metric on.
# NOTE(review): 'do_radeval_bertsore' is spelled exactly as in the original
# code; presumably it matches the RadEval constructor keyword -- confirm
# against the installed RadEval version before "correcting" the spelling.
_METRIC_FLAGS = {
    'RadGraph F1': 'do_radgraph',
    'BLEU': 'do_bleu',
    'ROUGE': 'do_rouge',
    'BERTScore': 'do_bertscore',
    'CheXbert F1': 'do_chexbert',
    'RaTEScore': 'do_ratescore',
    'RadCliQ': 'do_radcliq',
    'Temporal F1': 'do_temporal',
    'RadEval BERTScore': 'do_radeval_bertsore',
    'GREEN': 'do_green',
    'SRR-BERT': 'do_srr_bert',
}

_NO_METRICS_MESSAGE = "No metrics were computed. Please select at least one metric."
_NO_RESULTS_TABLE = [["No results", ""]]


def _format_score(value):
    """Return *value* as display text, or ``None`` if it is not a real number."""
    import numbers  # local import keeps this block self-contained

    # Integral first: bool/int (and integer-like scalars) are shown verbatim.
    if isinstance(value, numbers.Integral):
        return str(value)
    # Any other real number (builtin float or a registered float-like scalar)
    # is shown with four decimals.
    if isinstance(value, numbers.Real):
        return f"{float(value):.4f}"
    return None


def _iter_scores(results, prefix=None):
    """Yield ``(metric_name, formatted_score)`` for each numeric leaf of *results*.

    Nested dicts are walked recursively; names of nested entries are joined
    with ``_`` (``parent_child``). Non-numeric leaves are skipped.
    """
    for key, value in results.items():
        name = key if prefix is None else f"{prefix}_{key}"
        if isinstance(value, dict):
            yield from _iter_scores(value, name)
            continue
        text = _format_score(value)
        if text is not None:
            yield name, text


def _preview(text):
    """Return the first 100 characters of *text*, with ``...`` if truncated."""
    return f"{text[:100]}{'...' if len(text) > 100 else ''}"


def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with selected metrics on a pair of reference and hypothesis texts.

    Args:
        ref_text: Reference report text.
        hyp_text: Hypothesis report text.
        selected_metrics: Iterable of metric labels (keys of ``_METRIC_FLAGS``);
            ``None`` is treated as an empty selection.

    Returns:
        A ``(markdown_text, table_rows)`` tuple. ``table_rows`` is a list of
        ``[metric_name, formatted_score]`` pairs. When nothing can be
        evaluated, or evaluation fails, ``markdown_text`` carries the message
        and ``table_rows`` holds a single placeholder/error row. This function
        does not raise; every exception is turned into an error message.
    """
    try:
        refs = [(ref_text or "").strip()]
        hyps = [(hyp_text or "").strip()]

        # Blank input cannot be scored meaningfully; answer before any
        # evaluator is constructed.
        if not refs[0] or not hyps[0]:
            return (
                "Please enter both a reference report and a hypothesis report.",
                [row[:] for row in _NO_RESULTS_TABLE],
            )

        # Configure RadEval based on selected metrics
        chosen = set(selected_metrics or ())
        config = {flag: label in chosen for label, flag in _METRIC_FLAGS.items()}
        if not any(config.values()):
            return _NO_METRICS_MESSAGE, [row[:] for row in _NO_RESULTS_TABLE]

        # Initialize RadEval with selected metrics and run the evaluation
        evaluator = RadEval(**config)
        results = evaluator(refs=refs, hyps=hyps)

        # Prepare results for display
        table_data = [[name, text] for name, text in _iter_scores(results)]
        if not table_data:
            return _NO_METRICS_MESSAGE, [row[:] for row in _NO_RESULTS_TABLE]

        report_parts = [
            "## RadEval Results\n\n",
            f"**Reference:** {_preview(ref_text)}\n\n",
            f"**Hypothesis:** {_preview(hyp_text)}\n\n",
            "### Evaluation Scores:\n\n",
        ]
        report_parts.extend(f"- **{name}**: {text}\n" for name, text in table_data)
        return "".join(report_parts), table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]


# Preset radiology report pairs offered in the UI, as
# (title, reference report, hypothesis report) rows.
_EXAMPLE_PAIRS = (
    (
        "Normal vs Normal",
        "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    ),
    (
        "Pneumonia Case",
        "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    ),
    (
        "Temporal Comparison",
        "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    ),
    (
        "Discordant Reports",
        "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    ),
    (
        "Ambiguous Language",
        "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "Possible small effusion on the left. Atelectasis cannot be excluded.",
    ),
    (
        "Surgical Follow-up",
        "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "Post-operative changes from CABG are present. No signs of surgical complication.",
    ),
    (
        "False Positive",
        "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "Right lower lobe consolidation concerning for pneumonia.",
    ),
    (
        "Textual Hallucination",
        "Heart and mediastinum are normal. Lungs are clear.",
        "Large left pleural effusion with mediastinal shift to the right.",
    ),
    (
        "Negation Challenge",
        "No evidence of pneumothorax or pleural effusion.",
        "Evidence of small pneumothorax on the right.",
    ),
    (
        "Fine-grained Difference",
        "Mild interstitial markings at the lung bases, likely chronic.",
        "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    ),
)

# Title -> {"ref": reference text, "hyp": hypothesis text}, in display order.
examples = {
    title: {"ref": reference, "hyp": hypothesis}
    for title, reference, hypothesis in _EXAMPLE_PAIRS
}

def update_fields(choice):
    """Return updates for the reference and hypothesis text boxes.

    For any choice other than "Custom", both boxes are filled from
    ``examples[choice]`` and made read-only. For "Custom", both boxes are
    cleared and made editable.

    Returns:
        A ``(reference_update, hypothesis_update)`` tuple of ``gr.update`` values.
    """
    if choice != "Custom":
        preset = examples[choice]
        reference_update = gr.update(value=preset["ref"], interactive=False)
        hypothesis_update = gr.update(value=preset["hyp"], interactive=False)
        return reference_update, hypothesis_update

    reference_update = gr.update(value="", interactive=True)
    hypothesis_update = gr.update(value="", interactive=True)
    return reference_update, hypothesis_update


# Metric labels grouped into speed tiers, fastest tier first.
_FAST_METRICS = ["BLEU", "ROUGE", "BERTScore", "Temporal F1"]
_MEDIUM_METRICS = ["RadEval BERTScore", "RaTEScore", "RadCliQ", "SRR-BERT"]
_SLOW_METRICS = ["CheXbert F1", "RadGraph F1", "GREEN"]

# Available metrics (ordered by computational complexity)
available_metrics = [*_FAST_METRICS, *_MEDIUM_METRICS, *_SLOW_METRICS]

# Fast metrics for default selection: the first three entries of the list.
default_metrics = available_metrics[:3]


# Build the Gradio UI. Layout, top to bottom: intro text, example picker,
# the two report text boxes, metric checkboxes, run button, results
# (markdown summary + score table) and a collapsible metric reference.
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🩺 RadEval:  A framework for radiology text evaluation
        [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        The demo is currently running on **CPU**. When using some slower metrics (like RadGraph, CheXbert, GREEN), it may take a while to complete evaluation. Please be patient.
        """
    )

    # Example picker: "Custom" plus every key of `examples`.
    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    # Side-by-side inputs for the two reports being compared.
    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    # Changing the picker refreshes both text boxes via update_fields.
    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    # Outputs: markdown summary on the left, score table on the right.
    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:
            
            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings
            
            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: Specialized BERT for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation
            
            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
            """
        )

    # The run button feeds both reports and the metric selection to
    # run_radeval_simple and writes its two return values to the outputs.
    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )

# Launch the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()