Update app.py
Browse files
app.py
CHANGED
|
@@ -154,11 +154,10 @@ available_metrics = [
|
|
| 154 |
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
|
| 155 |
|
| 156 |
|
| 157 |
-
with gr.Blocks(title="RadEval
|
| 158 |
gr.Markdown(
|
| 159 |
"""
|
| 160 |
-
#
|
| 161 |
-
[Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
|
| 162 |
|
| 163 |
**RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
|
| 164 |
|
|
@@ -256,5 +255,410 @@ with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme
|
|
| 256 |
outputs=[analysis_output, table_output]
|
| 257 |
)
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
if __name__ == "__main__":
|
| 260 |
-
|
|
|
|
| 154 |
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
|
| 155 |
|
| 156 |
|
| 157 |
+
with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
|
| 158 |
gr.Markdown(
|
| 159 |
"""
|
| 160 |
+
# ποΈ RadEval Evaluation
|
|
|
|
| 161 |
|
| 162 |
**RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
|
| 163 |
|
|
|
|
| 255 |
outputs=[analysis_output, table_output]
|
| 256 |
)
|
| 257 |
|
| 258 |
+
# =============================================================================
# π§ͺ Hypothesis Testing Section
# =============================================================================

def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
    """
    Run statistical significance testing between multiple systems.

    Args:
        systems_data: JSON string containing a ``"references"`` list and a
            ``"systems"`` dict mapping system name -> list of generated
            reports (one report per reference).
        selected_test_metrics: metric names selected in the UI; a subset of
            {"BLEU", "ROUGE", "BERTScore", "Word Count"}.
        n_samples: number of randomization samples for the permutation test.
        significance_level: alpha threshold used to flag significant p-values.

    Returns:
        A single Markdown string. The Gradio handler is wired to exactly one
        output component, so every path — including validation errors —
        returns ONE string, never a tuple (the original code returned
        ``(msg, "")`` tuples from validation paths, which does not match the
        single-output wiring).
    """
    import json

    try:
        # Parse and validate the user-supplied JSON *before* importing
        # RadEval, so malformed input fails fast with a clear message even
        # when the (heavy) evaluation library is slow to load.
        systems_dict = json.loads(systems_data)

        if 'references' not in systems_dict or 'systems' not in systems_dict:
            return "Error: Please provide both 'references' and 'systems' in the JSON data."

        references = systems_dict['references']
        systems = systems_dict['systems']

        # Validate data integrity.
        if not references or not systems:
            return "Error: References and systems cannot be empty."

        if not isinstance(references, list) or not isinstance(systems, dict):
            return "Error: References must be a list and systems must be a dictionary."

        # Every system must produce exactly one output per reference.
        ref_count = len(references)
        for system_name, system_outputs in systems.items():
            if not isinstance(system_outputs, list):
                return f"Error: System '{system_name}' outputs must be a list."
            if len(system_outputs) != ref_count:
                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."

        # All texts must be non-empty strings (empty strings break several
        # downstream metrics).
        for i, ref in enumerate(references):
            if not isinstance(ref, str) or not ref.strip():
                return f"Error: Reference {i+1} is empty or not a string."

        for system_name, system_outputs in systems.items():
            for i, output in enumerate(system_outputs):
                if not isinstance(output, str) or not output.strip():
                    return f"Error: System '{system_name}' output {i+1} is empty or not a string."

        # Imported lazily: only needed once the input is known to be valid.
        from RadEval import RadEval, compare_systems

        # Initialize evaluators based on selected metrics (fast metrics only).
        evaluators = {}
        if 'BLEU' in selected_test_metrics:
            evaluators['bleu'] = RadEval(do_bleu=True)
        if 'ROUGE' in selected_test_metrics:
            evaluators['rouge'] = RadEval(do_rouge=True)
        if 'BERTScore' in selected_test_metrics:
            evaluators['bertscore'] = RadEval(do_bertscore=True)

        # Custom metric: average word count over the hypotheses
        # (len(hyps) > 0 is guaranteed by the validation above).
        def word_count_metric(hyps, refs):
            return sum(len(report.split()) for report in hyps) / len(hyps)

        # First hypothesis of the first system — used to smoke-test each
        # evaluator once before committing to the full (slow) comparison.
        first_hyp = [systems[list(systems.keys())[0]][0]]

        # Build metrics dictionary (following the compare_systems structure:
        # each value is a callable (hyps, refs) -> float).
        metrics = {}
        if 'BLEU' in selected_test_metrics:
            try:
                test_result = evaluators['bleu'](references[:1], first_hyp)
                if 'bleu' not in test_result:
                    return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys()))
                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
            except Exception as bleu_error:
                return f"Error testing BLEU evaluator: {str(bleu_error)}"

        if 'ROUGE' in selected_test_metrics:
            try:
                test_result = evaluators['rouge'](references[:1], first_hyp)
                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
                    if rouge_key not in test_result:
                        return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys()))
                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
            except Exception as rouge_error:
                return f"Error testing ROUGE evaluator: {str(rouge_error)}"

        if 'BERTScore' in selected_test_metrics:
            try:
                test_result = evaluators['bertscore'](references[:1], first_hyp)
                if 'bertscore' not in test_result:
                    return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys()))
                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
            except Exception as bert_error:
                return f"Error testing BERTScore evaluator: {str(bert_error)}"

        if 'Word Count' in selected_test_metrics:
            metrics['word_count'] = word_count_metric  # β example of a simple custom-defined metric

        if not metrics:
            return "Error: Please select at least one metric for testing."

        # Run randomization-based significance tests.
        try:
            signatures, scores = compare_systems(
                systems=systems,
                metrics=metrics,
                references=references,
                n_samples=int(n_samples),
                significance_level=float(significance_level),
                print_results=False  # We don't need print output for online demo
            )
        except Exception as compare_error:
            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues"

        # ---- Format results as Markdown -------------------------------------
        results_text = "## π§ͺ Hypothesis Testing Results\n\n"
        results_text += "**Parameters:**\n"
        results_text += f"- Randomization samples: {n_samples}\n"
        results_text += f"- Significance level: {significance_level}\n"
        results_text += f"- Number of systems: {len(systems)}\n"
        results_text += f"- Number of references: {len(references)}\n\n"

        # Significant differences summary — the first system listed is
        # treated as the baseline all others are compared against.
        results_text += "### π Significant Differences Summary\n\n"
        baseline_name = list(systems.keys())[0]
        results_text += f"**Baseline system:** {baseline_name}\n\n"

        has_significant_differences = False
        for system_name in systems.keys():
            if system_name == baseline_name:
                continue

            significant_metrics = []
            for metric_name in metrics.keys():
                pvalue_key = f"{metric_name}_pvalue"
                if pvalue_key in scores[system_name]:
                    p_val = scores[system_name][pvalue_key]
                    if p_val < float(significance_level):
                        significant_metrics.append(metric_name)

            if significant_metrics:
                results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
                has_significant_differences = True
            else:
                results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"

        if not has_significant_differences:
            results_text += "*No statistically significant differences found between systems.*\n\n"

        # Mean scores per system, rendered as a Markdown table.
        results_text += "### π Mean Scores by System\n\n"
        try:
            for system_name in systems.keys():
                results_text += f"**{system_name.upper()}:**\n\n"
                results_text += "| Metric | Score | P-value |\n"
                results_text += "|--------|-------|----------|\n"

                system_scores = scores.get(system_name, {})

                for metric_name in metrics.keys():
                    if metric_name in system_scores:
                        score = system_scores[metric_name]
                        pvalue_key = f"{metric_name}_pvalue"

                        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)

                        # P-values only exist for non-baseline systems.
                        if system_name != baseline_name and pvalue_key in system_scores:
                            pvalue = system_scores[pvalue_key]
                            pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
                            # Mark significant p-values with an asterisk.
                            if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
                                pvalue_str += " *"
                        else:
                            pvalue_str = "-" if system_name == baseline_name else "N/A"

                        results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"

                results_text += "\n"

            results_text += "*Note: Baseline system shows scores only. Other systems show scores and p-values comparing to baseline.*\n"
            results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"

        except Exception as score_error:
            results_text += f"Error formatting scores: {str(score_error)}\n\n"

        return results_text

    except ImportError as e:
        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
    except json.JSONDecodeError:
        return "Error: Invalid JSON format in systems data."
    except Exception as e:
        return f"Testing Error: {str(e)}"
|
| 459 |
+
|
| 460 |
+
# Create Hypothesis Testing UI
# Standalone Blocks app for the significance-testing tab; later embedded in
# the combined TabbedInterface as `hypothesis_demo`.
with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
    # Page header: short explanation plus a runtime warning, since the
    # permutation test re-evaluates every metric n_samples times.
    gr.Markdown(
        """
        # π₯οΈ Null Hypothesis Testing

        **Statistical significance testing** for comparing multiple radiology report generation systems.
        This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.

        **β οΈ Performance Warning β οΈ**

        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
        """
    )

    with gr.Row():
        # Left column: raw JSON input with references + per-system outputs.
        # NOTE(review): Gradio documents `scale` as an int; scale=1.5 may be
        # coerced or rejected depending on the Gradio version — confirm.
        with gr.Column(scale=1.5):
            systems_input = gr.Textbox(
                label="π Systems Data (JSON Format)",
                lines=18,
                placeholder="""Enter systems data in JSON format, e.g.:
{
  "references": [
    "No acute cardiopulmonary process.",
    "Mild cardiomegaly with clear lung fields."
  ],
  "systems": {
    "baseline": [
      "No acute findings.",
      "Mild cardiomegaly, clear lungs."
    ],
    "improved": [
      "No acute cardiopulmonary process.",
      "Mild cardiomegaly with clear lung fields bilaterally."
    ]
  }
}""",
                info="Provide reference reports and multiple systems to compare"
            )

        # Right column: test configuration (metrics, sample count, alpha).
        with gr.Column(scale=1):
            # Only cheap metrics are offered; heavy model-based metrics
            # (RadGraph, CheXbert, GREEN) are deliberately excluded here.
            test_metrics_selection = gr.CheckboxGroup(
                label="π― Select Metrics for Testing",
                choices=["BLEU", "ROUGE", "BERTScore", "Word Count"],
                value=["BLEU", "ROUGE", "BERTScore"],
                interactive=True,
                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
            )

            # Number of random permutations used by the significance test.
            n_samples_input = gr.Number(
                label="π Randomization Samples",
                value=50,
                minimum=10,
                maximum=1000,
                step=10,
                info="Number of randomisation samples (higher = more confidence, but slower)"
            )

            # Alpha threshold; p-values below this are flagged significant.
            significance_level_input = gr.Number(
                label="π Significance Level (Ξ±)",
                value=0.05,
                minimum=0.01,
                maximum=0.10,
                step=0.01,
                info="Alpha level for significance testing"
            )

            example_button = gr.Button("π Load Example Data", variant="secondary")
            clear_button = gr.Button("ποΈ Clear Data", variant="secondary")


    with gr.Row():
        test_button = gr.Button("π§ͺ Run Hypothesis Testing", variant="primary", size="lg")

    with gr.Row():
        # Single Markdown component that receives the full results report
        # (run_hypothesis_testing returns one Markdown string).
        test_results = gr.Markdown(
            value="π **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
        )
|
| 538 |
+
|
| 539 |
+
# Example data button
def load_example_data():
    """Return the demo reference/system reports as pretty-printed JSON.

    Used by the 'Load Example Data' button to pre-fill the systems textbox
    with five references and three systems of varying quality (baseline,
    improved, poor), each with one output per reference.
    """
    import json

    reference_reports = [
        "No acute cardiopulmonary process.",
        "No radiographic findings to suggest pneumonia.",
        "Mild cardiomegaly with clear lung fields.",
        "Small pleural effusion on the right side.",
        "Status post cardiac surgery with stable appearance.",
    ]
    system_outputs = {
        "baseline": [
            "No acute findings.",
            "No pneumonia.",
            "Mild cardiomegaly, clear lungs.",
            "Small right pleural effusion.",
            "Post-cardiac surgery, stable.",
        ],
        "improved": [
            "No acute cardiopulmonary process.",
            "No radiographic findings suggesting pneumonia.",
            "Mild cardiomegaly with clear lung fields bilaterally.",
            "Small pleural effusion present on the right side.",
            "Status post cardiac surgery with stable appearance.",
        ],
        "poor": [
            "Normal.",
            "OK.",
            "Heart big.",
            "Some fluid.",
            "Surgery done.",
        ],
    }

    payload = {"references": reference_reports, "systems": system_outputs}
    return json.dumps(payload, indent=2)
|
| 575 |
+
|
| 576 |
+
# Fill the JSON textbox with the bundled example payload.
example_button.click(
    load_example_data,
    outputs=systems_input
)

# Reset the JSON textbox to empty.
clear_button.click(
    lambda: "",
    outputs=systems_input
)

# Main action: run the significance test with the current inputs and
# render the returned Markdown report into the single results component.
test_button.click(
    run_hypothesis_testing,
    inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
    outputs=[test_results]
)
|
| 591 |
+
|
| 592 |
+
# Collapsible help panel: methodology, expected input format, output
# description, and performance guidance for the significance test.
with gr.Accordion("π‘ Hypothesis Testing Information", open=False):
    gr.Markdown(
        """
        ### π¬ How it Works:

        This tool performs **randomization-based significance testing** to compare multiple systems:

        1. **Null Hypothesis**: No difference between systems
        2. **Randomization**: Randomly permute system outputs multiple times
        3. **P-value Calculation**: Proportion of permutations where random difference β₯ observed difference
        4. **Significance**: If p-value < Ξ±, reject null hypothesis (systems are significantly different)

        ### π Input Format:
        - **References**: Ground truth reports
        - **Systems**: Multiple systems to compare (each with same number of outputs as references)
        - **Metrics**: Evaluation metrics to use for comparison

        ### π Output:
        - **Significance Matrix**: P-values for all pairwise system comparisons
        - **Mean Scores**: Average performance of each system on each metric
        - **Bold p-values**: Indicate statistically significant differences

        ### β‘ Performance:
        - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
        - **Excluded Slow Metrics**: RadGraph F1, CheXbert F1 are excluded to ensure reasonable computation time
        - More randomization samples = more accurate p-values but slower computation
        - Recommended: 50-100 samples for quick testing, 1000+ for publication
        """
    )
|
| 621 |
+
|
| 622 |
+
# Combine both demos using gr.Blocks to add a header
# The custom CSS restyles the tab-navigation buttons (gradient background,
# hover lift, distinct selected state).
with gr.Blocks(
    title="RadEval: A framework for radiology text evaluation",
    theme=gr.themes.Soft(),
    css="""
    .tab-nav button {
        font-weight: bold !important;
        border: 2px solid #e0e7ff !important;
        border-radius: 10px !important;
        margin: 0 5px !important;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        color: white !important;
        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
        transition: all 0.3s ease !important;
    }
    .tab-nav button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
    }
    .tab-nav button.selected {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
        border-color: #ff6b6b !important;
        transform: translateY(-1px) !important;
        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
    }
    """
) as combined_demo:
    # Shared header with project links shown above both tabs.
    gr.Markdown(
        """
        # π©Ί RadEval: A framework for radiology text evaluation
        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        """
    )

    # Two tabs: the per-report evaluation demo and the significance-testing
    # demo. NOTE(review): nesting gr.TabbedInterface (itself a Blocks)
    # inside another gr.Blocks is unusual — confirm it renders correctly on
    # the pinned Gradio version (gr.Tab/.render() is the documented pattern).
    tabs = gr.TabbedInterface(
        [demo, hypothesis_demo],
        ["ποΈ RadEval Evaluation", "π₯οΈ Null Hypothesis Testing"]
    )

# Launch the combined app only when run as a script (not when imported).
if __name__ == "__main__":
    combined_demo.launch()
|