zhenwu0831 commited on
Commit
85a9e79
·
1 Parent(s): fd3a49e
Files changed (1) hide show
  1. app.py +9 -9
app.py CHANGED
@@ -174,7 +174,7 @@ def load_gold_map() -> Dict[str, Dict[str, Any]]:
174
  if not qid:
175
  continue
176
  gold[qid] = {
177
- "question": str(ex.get("question", ex.get("query", ""))).strip(), # May be empty
178
  "gold_answer": str(ex.get("answer", ex.get("gold", ex.get("reference", "")))).strip(),
179
  }
180
 
@@ -475,11 +475,7 @@ def process_submission(file):
475
  f1 = token_f1(pred, gold)
476
  rec = answer_recall(pred, gold)
477
  rouge = compute_rouge(pred, gold)
478
-
479
- # Only use LLM judge if question is provided
480
- judge = None
481
- if question: # Skip judge if question is empty
482
- judge = openai_judge(question, pred)
483
 
484
  em_sum += em
485
  f1_sum += f1
@@ -491,6 +487,7 @@ def process_submission(file):
491
  if judge is not None:
492
  judge_sum += int(judge)
493
  judge_n += 1
 
494
 
495
  denom = attempted if attempted > 0 else 1
496
 
@@ -550,7 +547,10 @@ def process_submission(file):
550
  f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
551
  ]
552
  if avg_judge is None:
553
- lines.append("LLM judge: NA (set OPENAI_API to enable)")
 
 
 
554
  else:
555
  lines.append(f"LLM judge: {avg_judge:.3f} (1-5)")
556
 
@@ -596,11 +596,11 @@ We compute multiple metrics:
596
 
597
  with gr.Tabs():
598
  with gr.Tab("📤 Submit"):
599
- file_input = gr.File(label="Upload submission in json", file_types=[".json"])
600
  submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
601
  status = gr.Textbox(label="Result", lines=10, interactive=False)
602
 
603
- gr.Markdown("### Sample submission")
604
  sample = gr.Textbox(value=sample_submission_text(), lines=6)
605
 
606
  with gr.Tab("🏅 Leaderboard"):
 
174
  if not qid:
175
  continue
176
  gold[qid] = {
177
+ "question": str(ex.get("question", ex.get("query", ""))).strip(),
178
  "gold_answer": str(ex.get("answer", ex.get("gold", ex.get("reference", "")))).strip(),
179
  }
180
 
 
475
  f1 = token_f1(pred, gold)
476
  rec = answer_recall(pred, gold)
477
  rouge = compute_rouge(pred, gold)
478
+ judge = openai_judge(question, pred)
 
 
 
 
479
 
480
  em_sum += em
481
  f1_sum += f1
 
487
  if judge is not None:
488
  judge_sum += int(judge)
489
  judge_n += 1
490
+ judge_n += 1
491
 
492
  denom = attempted if attempted > 0 else 1
493
 
 
547
  f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
548
  ]
549
  if avg_judge is None:
550
+ if not OPENAI_API_KEY:
551
+ lines.append("LLM judge: NA (set OPENAI_API to enable)")
552
+ else:
553
+ lines.append("LLM judge: NA (questions not available in gold dataset)")
554
  else:
555
  lines.append(f"LLM judge: {avg_judge:.3f} (1-5)")
556
 
 
596
 
597
  with gr.Tabs():
598
  with gr.Tab("📤 Submit"):
599
+ file_input = gr.File(label="Upload submission.json", file_types=[".json"])
600
  submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
601
  status = gr.Textbox(label="Result", lines=10, interactive=False)
602
 
603
+ gr.Markdown("### Sample submission.json")
604
  sample = gr.Textbox(value=sample_submission_text(), lines=6)
605
 
606
  with gr.Tab("πŸ… Leaderboard"):