|
|
import gradio as gr |
|
|
import sys |
|
|
import os |
|
|
import torch |
|
|
|
|
|
def setup_cpu_environment(num_threads: int = 4) -> None:
    """Configure the process for CPU-only inference.

    Hides all CUDA devices, caps torch's intra-op thread count, disables
    tokenizer thread parallelism, and points the transformers model cache
    at a local directory. Call once, before any model is loaded.

    Args:
        num_threads: Number of CPU threads torch may use (default 4,
            matching the original hard-coded value).
    """
    # Hide every GPU so torch/transformers fall back to CPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    # Cap intra-op parallelism so the demo doesn't saturate the host.
    torch.set_num_threads(num_threads)

    # Silence the huggingface tokenizers fork/parallelism warning.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    # NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
    # releases in favor of HF_HOME — confirm against the pinned version.
    os.environ['TRANSFORMERS_CACHE'] = './cache'
|
|
|
|
|
# Apply the CPU-only settings at import time, before RadEval is imported below.
setup_cpu_environment()
|
|
|
|
|
from RadEval import RadEval |
|
|
|
|
|
def run_radeval_simple(ref_text, hyp_text, selected_metrics): |
|
|
""" |
|
|
Run RadEval with selected metrics on a pair of reference and hypothesis texts |
|
|
""" |
|
|
try: |
|
|
|
|
|
refs = [ref_text.strip()] |
|
|
hyps = [hyp_text.strip()] |
|
|
|
|
|
|
|
|
config = { |
|
|
'do_radgraph': 'RadGraph F1' in selected_metrics, |
|
|
'do_bleu': 'BLEU' in selected_metrics, |
|
|
'do_rouge': 'ROUGE' in selected_metrics, |
|
|
'do_bertscore': 'BERTScore' in selected_metrics, |
|
|
'do_chexbert': 'CheXbert F1' in selected_metrics, |
|
|
'do_ratescore': 'RaTEScore' in selected_metrics, |
|
|
'do_radcliq': 'RadCliQ' in selected_metrics, |
|
|
'do_temporal': 'Temporal F1' in selected_metrics, |
|
|
'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics, |
|
|
'do_green': 'GREEN' in selected_metrics, |
|
|
'do_srr_bert': 'SRR-BERT' in selected_metrics |
|
|
} |
|
|
|
|
|
|
|
|
evaluator = RadEval(**config) |
|
|
|
|
|
|
|
|
results = evaluator(refs=refs, hyps=hyps) |
|
|
|
|
|
|
|
|
table_data = [] |
|
|
analysis_text = "## RadEval Results\n\n" |
|
|
analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n" |
|
|
analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n" |
|
|
analysis_text += "### Evaluation Scores:\n\n" |
|
|
|
|
|
for metric, score in results.items(): |
|
|
if isinstance(score, (int, float)): |
|
|
formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score) |
|
|
table_data.append([metric, formatted_score]) |
|
|
analysis_text += f"- **{metric}**: {formatted_score}\n" |
|
|
elif isinstance(score, dict): |
|
|
|
|
|
for sub_metric, sub_score in score.items(): |
|
|
if isinstance(sub_score, (int, float)): |
|
|
formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score) |
|
|
metric_name = f"{metric}_{sub_metric}" |
|
|
table_data.append([metric_name, formatted_score]) |
|
|
analysis_text += f"- **{metric_name}**: {formatted_score}\n" |
|
|
|
|
|
if not table_data: |
|
|
return "No metrics were computed. Please select at least one metric.", [["No results", ""]] |
|
|
|
|
|
return analysis_text, table_data |
|
|
|
|
|
except ImportError as e: |
|
|
error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed." |
|
|
return error_msg, [["Error", error_msg]] |
|
|
except Exception as e: |
|
|
error_msg = f"Evaluation Error: {str(e)}" |
|
|
return error_msg, [["Error", error_msg]] |
|
|
|
|
|
|
|
|
|
|
|
# Curated reference/hypothesis report pairs illustrating common evaluation
# scenarios (paraphrase agreement, hallucination, negation, temporal change,
# false positives, ...). The keys double as the labels offered by the example
# selector Radio in the UI.
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}
|
|
|
|
|
def update_fields(choice):
    """Populate the two report textboxes from the chosen example.

    A named example loads its canned reference/hypothesis pair and locks the
    fields; choosing "Custom" clears both fields and makes them editable.
    """
    if choice != "Custom":
        sample = examples[choice]
        return (
            gr.update(value=sample["ref"], interactive=False),
            gr.update(value=sample["hyp"], interactive=False),
        )
    return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
# Metric labels offered in the checkbox group, ordered roughly from fast to
# slow (see the "Performance Notes" accordion below). Each label is mapped to
# a RadEval `do_*` constructor flag inside run_radeval_simple.
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]
|
|
|
|
|
|
|
|
default_metrics = ["BLEU", "ROUGE", "BERTScore"] |
|
|
|
|
|
|
|
|
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo: |
|
|
gr.Markdown( |
|
|
""" |
|
|
# π©Ί RadEval: A framework for radiology text evaluation |
|
|
[Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]() |
|
|
|
|
|
**RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box. |
|
|
|
|
|
**β οΈ Performance Warning β οΈ** |
|
|
|
|
|
The demo is currently running on **CPU**. When using some slower metrics (like RadGraph, CheXbert, GREEN), it may take a while to complete evaluation. Please be patient. |
|
|
""" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
choice = gr.Radio( |
|
|
label="π Choose Example or Custom Input", |
|
|
choices=["Custom"] + list(examples.keys()), |
|
|
value="Custom", |
|
|
interactive=True |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=1): |
|
|
ref_input = gr.Textbox( |
|
|
label="π Reference Report (Ground Truth)", |
|
|
lines=5, |
|
|
placeholder="Enter the reference radiology report here...", |
|
|
info="The ground truth or expert-written report" |
|
|
) |
|
|
with gr.Column(scale=1): |
|
|
hyp_input = gr.Textbox( |
|
|
label="π€ Hypothesis Report (Generated)", |
|
|
lines=5, |
|
|
placeholder="Enter the generated/predicted radiology report here...", |
|
|
info="The AI-generated or system-produced report" |
|
|
) |
|
|
|
|
|
choice.change( |
|
|
update_fields, |
|
|
inputs=choice, |
|
|
outputs=[ref_input, hyp_input], |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
metrics_selection = gr.CheckboxGroup( |
|
|
label="π― Select Evaluation Metrics", |
|
|
choices=available_metrics, |
|
|
value=default_metrics, |
|
|
interactive=True, |
|
|
info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)." |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
run_button = gr.Button("π Run RadEval", variant="primary", size="lg") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
analysis_output = gr.Markdown( |
|
|
value="π **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'." |
|
|
) |
|
|
with gr.Column(scale=1): |
|
|
table_output = gr.DataFrame( |
|
|
label="π Detailed Scores", |
|
|
headers=["Metric", "Score"], |
|
|
wrap=True |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Accordion("π‘ Metric Information", open=False): |
|
|
gr.Markdown( |
|
|
""" |
|
|
### π Available Metrics: |
|
|
|
|
|
**Traditional NLG Metrics:** |
|
|
- **BLEU**: N-gram overlap between reference and hypothesis |
|
|
- **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L) |
|
|
- **BERTScore**: Semantic similarity using BERT embeddings |
|
|
|
|
|
**Radiology-Specific Metrics:** |
|
|
- **RadGraph F1**: Entity and relation extraction for radiology |
|
|
- **CheXbert F1**: Chest X-ray finding classification performance |
|
|
- **RaTEScore**: Radiology-aware text evaluation score |
|
|
- **RadCliQ**: Composite metric for radiology reports |
|
|
- **Temporal F1**: Temporal entity and relationship evaluation |
|
|
- **RadEval BERTScore**: Specialized BERT for radiology text |
|
|
- **GREEN**: Generative evaluation with natural language explanations |
|
|
- **SRR-BERT**: Structured radiology reasoning evaluation |
|
|
|
|
|
### β‘ Performance Notes: |
|
|
- **Fast**: BLEU, ROUGE, BERTScore, Temporal F1 |
|
|
- **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT |
|
|
- **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads) |
|
|
""" |
|
|
) |
|
|
|
|
|
run_button.click( |
|
|
run_radeval_simple, |
|
|
inputs=[ref_input, hyp_input, metrics_selection], |
|
|
outputs=[analysis_output, table_output] |
|
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.launch() |
|
|
|