Lasha committed on
Commit
12f68b6
·
1 Parent(s): cbe9164

MMOU Eval

Browse files
Files changed (2) hide show
  1. app.py +562 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import time
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass
8
+ from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
# Choose a writable temp directory for Gradio before it is imported, unless the
# deployment already configured one. Candidates are probed in order and the
# first directory that accepts an actual write wins.
if "GRADIO_TEMP_DIR" not in os.environ:
    for candidate in (
        Path(__file__).resolve().parent / ".gradio_tmp",
        Path.cwd() / ".gradio_tmp",
        Path("/tmp") / "gradio",
    ):
        try:
            candidate.mkdir(parents=True, exist_ok=True)
            # mkdir succeeding is not enough on some mounts; verify with a probe write.
            probe = candidate / ".write_probe"
            probe.write_text("ok", encoding="utf-8")
            probe.unlink()
            os.environ["GRADIO_TEMP_DIR"] = str(candidate)
            break
        except OSError:
            # Read-only or permission-restricted location; try the next candidate.
            continue
27
+
28
+ import gradio as gr
29
+ import pandas as pd
30
+ from huggingface_hub import hf_hub_download
31
+
32
+
33
# --- Ground-truth source defaults; each is overridable via a MMOU_GT_* env var
# --- (see materialize_ground_truth_file).
DEFAULT_GT_LOCAL_PATH = ""  # empty -> no local file; download from the Hub instead
DEFAULT_GT_REPO_ID = "nvidia/mmou-gt"
DEFAULT_GT_FILENAME = "MMOU.json"
DEFAULT_GT_REPO_TYPE = "dataset"
DEFAULT_GT_TOKEN_ENV = "HF_TOKEN"  # name of the env var that holds the Hub token

# Fixed row order for the domain breakdown table.
DOMAINS_ORDER = [
    "Sports",
    "Travel",
    "Video Games",
    "Daily Life",
    "Academic Lectures",
    "Film",
    "Pranks",
    "Music",
    "Animation",
    "News",
]
# Row order for the duration breakdown (bucket labels are in minutes).
DURATION_BUCKET_ORDER = ["< 5", "5–10", "10–20", "20–30", "> 30", "Overall"]
# Ground-truth record keys probed, in order, for the correct option letter.
GT_LETTER_KEYS = (
    "correct_option_letter",
    "correct_answer_letter",
    "label",
    "gold_label",
    "answer_letter",
)
# Alternate ground-truth key names for the domain / duration / skill fields.
GT_DOMAIN_KEYS = ("domain", "category")
GT_DURATION_KEYS = ("video_duration", "video_duration_sec", "duration", "duration_sec")
GT_SKILL_KEYS = ("question_type", "skills", "skill", "question_types")
# Valid single-letter answers (10-way multiple choice).
OPTION_LETTERS = set("ABCDEFGHIJ")

# Markdown rendered at the top of the page.
APP_INTRO = """
# MMOU Evaluator

Upload a `.json` or `.jsonl` file where each entry contains `question_id` and `answer`.
"""

# Submission-format help rendered below the controls.
FORMAT_GUIDE = """
### Submission Format

Each entry must contain:

- `question_id`
- `answer`

`answer` must be a single letter from `A` to `J`. Letter matching is case-insensitive. Extra keys are ignored.
Rows with empty or `null` answers are ignored.

Example JSON:

```json
[
  {"question_id": "54aaef4d-2c22-476e-a7e7-37efabde2520", "answer": "C"},
  {"question_id": "a7f8790d-7828-4ece-a63a-a5d13edf9026", "answer": "B"}
]
```

Example JSONL:

```json
{"question_id": "54aaef4d-2c22-476e-a7e7-37efabde2520", "answer": "C"}
{"question_id": "a7f8790d-7828-4ece-a63a-a5d13edf9026", "answer": "B"}
```
"""

# Initial contents of the status and summary panels.
READY_STATUS_MARKDOWN = "### Ready\nUpload a prediction file and click `Evaluate`."
EMPTY_SUMMARY_MARKDOWN = """
### Summary

Run an evaluation to populate the aggregate summary.
"""

# Custom CSS: center the app and normalize font sizes across widgets.
LAYOUT_CSS = """
.gradio-container {
    max-width: 1100px !important;
    margin: 0 auto !important;
    padding-left: 1rem !important;
    padding-right: 1rem !important;
    font-size: 16px !important;
}

.gradio-container .prose,
.gradio-container .gr-markdown,
.gradio-container .gr-dataframe,
.gradio-container label,
.gradio-container button,
.gradio-container input,
.gradio-container textarea {
    font-size: 1rem !important;
}
"""
124
+
125
+
126
@dataclass(frozen=True)
class GroundTruthEntry:
    """One ground-truth question; keyed by question_id in load_ground_truth()."""

    # Correct option letter (single uppercase character, one of A–J).
    correct_letter: str
    # Content domain label; "Unknown" when the record carried none.
    domain: str
    # Source video length in seconds, if the record provided one.
    video_duration_sec: float | None
    # Deduplicated skill / question-type tags (may be empty).
    skills: tuple[str, ...]
132
+
133
+
134
def stringify(value: Any) -> str:
    """Coerce *value* to a string: None -> "", str trimmed, scalars via str(), else compact JSON."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (bool, int, float)):
        return str(value)
    if value is None:
        return ""
    return json.dumps(value, ensure_ascii=True)
142
+
143
+
144
+ def coerce_float(value: Any) -> float | None:
145
+ if value is None or value == "":
146
+ return None
147
+ if isinstance(value, (int, float)):
148
+ return float(value)
149
+ if isinstance(value, str):
150
+ try:
151
+ return float(value.strip())
152
+ except ValueError:
153
+ return None
154
+ return None
155
+
156
+
157
def first_present(record: dict[str, Any], keys: tuple[str, ...]) -> Any:
    """Return the value of the first key whose value is not None, '', or []; else None."""
    for key in keys:
        if record.get(key) not in (None, "", []):
            return record[key]
    return None
159
+
160
+
161
def parse_skill_list(value: Any) -> tuple[str, ...]:
    """Normalize a skill field (scalar, list, or None) into a deduplicated tuple of cleaned tags."""
    if value is None:
        candidates: list[Any] = []
    elif isinstance(value, list):
        candidates = value
    else:
        candidates = [value]

    # Insertion-ordered dict keys give us first-seen order with deduplication.
    ordered: dict[str, None] = {}
    for candidate in candidates:
        text = stringify(candidate).strip().strip("\"'")
        if text:
            ordered.setdefault(text, None)
    return tuple(ordered)
171
+
172
+
173
def safe_pct(correct: int, total: int) -> float:
    """Percentage correct/total; 0.0 when total is zero (avoids ZeroDivisionError)."""
    if not total:
        return 0.0
    return 100.0 * correct / total
175
+
176
+
177
def duration_bucket(minutes: float) -> str:
    """Map a duration in minutes onto one of the fixed report buckets."""
    thresholds = ((5, "< 5"), (10, "5–10"), (20, "10–20"), (30, "20–30"))
    for upper_bound, label in thresholds:
        if minutes < upper_bound:
            return label
    return "> 30"
187
+
188
+
189
def normalize_answer(value: Any) -> str:
    """Uppercase a predicted answer; '' for blank input, ValueError unless a single A–J letter."""
    letter = stringify(value).upper()
    if letter == "":
        return ""
    if len(letter) == 1 and letter in OPTION_LETTERS:
        return letter
    raise ValueError("Each `answer` must be a single letter from A to J.")
196
+
197
+
198
+ def load_records(path: str | Path, *, allow_data_key: bool = False) -> tuple[list[dict[str, Any]], str]:
199
+ file_path = Path(path)
200
+ suffix = file_path.suffix.lower()
201
+
202
+ if suffix in {".jsonl", ".ndjson"}:
203
+ records: list[dict[str, Any]] = []
204
+ with file_path.open("r", encoding="utf-8") as handle:
205
+ for line_number, line in enumerate(handle, start=1):
206
+ if not line.strip():
207
+ continue
208
+ record = json.loads(line)
209
+ if not isinstance(record, dict):
210
+ raise ValueError(f"Line {line_number} in JSONL must be an object.")
211
+ records.append(record)
212
+ return records, "jsonl"
213
+
214
+ with file_path.open("r", encoding="utf-8") as handle:
215
+ payload = json.load(handle)
216
+
217
+ if isinstance(payload, list):
218
+ records = payload
219
+ elif allow_data_key and isinstance(payload, dict) and isinstance(payload.get("data"), list):
220
+ records = payload["data"]
221
+ else:
222
+ raise ValueError("JSON file must contain a list of objects.")
223
+
224
+ if not all(isinstance(item, dict) for item in records):
225
+ raise ValueError("JSON file must contain only objects.")
226
+
227
+ return records, "json"
228
+
229
+
230
def materialize_ground_truth_file() -> Path:
    """Resolve the ground-truth file: a configured local path, else a Hub download.

    Raises:
        FileNotFoundError: when MMOU_GT_PATH is set but missing on disk.
        RuntimeError: when neither a local path nor a repo/filename is configured.
    """
    configured_path = os.getenv("MMOU_GT_PATH", DEFAULT_GT_LOCAL_PATH).strip()
    if configured_path:
        gt_file = Path(configured_path)
        if gt_file.exists():
            return gt_file
        raise FileNotFoundError(
            "MMOU_GT_PATH is set, but the file does not exist. "
            "Update the configured path or mount the private file correctly."
        )

    repo_id = os.getenv("MMOU_GT_REPO_ID", DEFAULT_GT_REPO_ID).strip()
    filename = os.getenv("MMOU_GT_FILENAME", DEFAULT_GT_FILENAME).strip()
    if not (repo_id and filename):
        raise RuntimeError(
            "Ground truth is not configured. Set MMOU_GT_PATH or "
            "MMOU_GT_REPO_ID/MMOU_GT_FILENAME before launching the app."
        )

    repo_type = os.getenv("MMOU_GT_REPO_TYPE", DEFAULT_GT_REPO_TYPE).strip() or "dataset"
    # The token env var name itself is configurable; fall back to HF_TOKEN.
    token_env = os.getenv("MMOU_GT_TOKEN_ENV", DEFAULT_GT_TOKEN_ENV).strip() or "HF_TOKEN"
    token = os.getenv(token_env) or os.getenv("HF_TOKEN", "")
    downloaded = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        token=token or None,
    )
    return Path(downloaded)
260
+
261
+
262
@lru_cache(maxsize=1)
def load_ground_truth() -> dict[str, GroundTruthEntry]:
    """Load and index the ground truth by question_id (cached for the process lifetime).

    Records missing a question_id or a usable correct letter are skipped
    silently; remaining fields fall back to "Unknown" / None / empty skills.

    Raises:
        RuntimeError: if no usable entries are found at all.
    """
    records, _ = load_records(materialize_ground_truth_file(), allow_data_key=True)
    entries: dict[str, GroundTruthEntry] = {}

    for record in records:
        question_id = stringify(record.get("question_id"))
        if not question_id:
            continue

        # First key (in GT_LETTER_KEYS order) whose value normalizes to a
        # valid option letter wins; "" when none matches.
        correct_letter = next(
            (
                letter
                for key in GT_LETTER_KEYS
                if (letter := stringify(record.get(key)).upper()) in OPTION_LETTERS
            ),
            "",
        )
        if not correct_letter:
            continue

        entries[question_id] = GroundTruthEntry(
            correct_letter=correct_letter,
            domain=stringify(first_present(record, GT_DOMAIN_KEYS)) or "Unknown",
            video_duration_sec=coerce_float(first_present(record, GT_DURATION_KEYS)),
            skills=parse_skill_list(first_present(record, GT_SKILL_KEYS)),
        )

    if not entries:
        raise RuntimeError("No usable ground-truth question IDs were found.")

    return entries
294
+
295
+
296
def build_prediction_map(records: list[dict[str, Any]]) -> tuple[dict[str, str], int, int]:
    """Index predictions by question_id.

    Returns (predictions, duplicate_count, skipped_blank_count). When the same
    question_id appears more than once, the last answer wins; rows whose answer
    is blank/None are skipped entirely.

    Raises:
        ValueError: when a row lacks question_id or has a malformed answer.
    """
    predictions: dict[str, str] = {}
    duplicate_count = 0
    skipped_blank = 0

    for row_number, record in enumerate(records, start=1):
        question_id = stringify(record.get("question_id"))
        if not question_id:
            raise ValueError(f"Row {row_number} is missing `question_id`.")
        letter = normalize_answer(record.get("answer"))
        if letter == "":
            skipped_blank += 1
            continue
        duplicate_count += int(question_id in predictions)
        predictions[question_id] = letter

    return predictions, duplicate_count, skipped_blank
314
+
315
+
316
def bump(stats: dict[str, dict[str, int]], keys: list[str], field: str) -> None:
    """Increment *field* by one in every stats bucket named in *keys*."""
    for bucket_name in keys:
        stats[bucket_name][field] += 1
319
+
320
+
321
def make_breakdown_dataframe(
    stats: dict[str, dict[str, int]],
    label: str,
    ordered_labels: list[str] | None = None,
) -> pd.DataFrame:
    """Turn per-bucket counters into a breakdown table.

    With ordered_labels, rows follow that order (unknown labels last, then
    alphabetical); otherwise rows sort by answered accuracy, ties by Total.
    An empty stats dict yields an empty frame with the expected columns.
    """
    columns = [
        label,
        "Official Accuracy (%)",
        "Answered Accuracy (%)",
        "Coverage (%)",
        "Correct",
        "Answered",
        "Total",
    ]

    rows = []
    for bucket_name, counts in stats.items():
        rows.append(
            {
                label: bucket_name,
                "Official Accuracy (%)": round(safe_pct(counts["correct"], counts["total"]), 2),
                "Answered Accuracy (%)": round(safe_pct(counts["correct"], counts["answered"]), 2),
                "Coverage (%)": round(safe_pct(counts["answered"], counts["total"]), 2),
                "Correct": counts["correct"],
                "Answered": counts["answered"],
                "Total": counts["total"],
            }
        )

    if not rows:
        return pd.DataFrame(columns=columns)

    frame = pd.DataFrame(rows)
    if ordered_labels:
        # Rank known labels by their canonical position; unknowns sort after.
        rank = {name: position for position, name in enumerate(ordered_labels)}
        frame["_rank"] = frame[label].map(lambda name: rank.get(name, len(rank)))
        frame = frame.sort_values(["_rank", label]).drop(columns="_rank")
    else:
        frame = frame.sort_values(["Answered Accuracy (%)", "Total"], ascending=[False, False])
    return frame.reset_index(drop=True)
359
+
360
+
361
+ def build_metrics_markdown(summary: dict[str, Any]) -> str:
362
+ return "\n".join(
363
+ [
364
+ "### Metrics",
365
+ f"- Official accuracy: `{summary['official_accuracy_pct']:.2f}%` "
366
+ f"(`{summary['correct']} / {summary['total_ground_truth']}`)",
367
+ f"- Answered accuracy: `{summary['answered_accuracy_pct']:.2f}%` "
368
+ f"(`{summary['correct']} / {summary['answered_predictions']}`)",
369
+ f"- Coverage: `{summary['coverage_pct']:.2f}%`",
370
+ f"- Matched IDs: `{summary['matched_prediction_ids']}`",
371
+ f"- Missing IDs: `{summary['missing_prediction_ids']}`",
372
+ f"- Extra IDs: `{summary['extra_prediction_ids']}`",
373
+ f"- Duplicate IDs: `{summary['duplicate_prediction_ids']}`",
374
+ ]
375
+ )
376
+
377
+
378
def build_summary_markdown(domain_df: pd.DataFrame, duration_df: pd.DataFrame, skill_df: pd.DataFrame) -> str:
    """Summarize notable buckets (best domain/duration, weakest skill) as Markdown."""
    accuracy_column = "Answered Accuracy (%)"

    def describe(frame: pd.DataFrame, name_column: str, *, best: bool) -> str:
        # Top (or bottom) row by answered accuracy; ties broken by larger Total.
        picked = frame.sort_values([accuracy_column, "Total"], ascending=[not best, False]).iloc[0]
        return f"{picked[name_column]} ({picked[accuracy_column]:.2f}%)"

    best_domain = "n/a" if domain_df.empty else describe(domain_df, "Domain", best=True)

    best_duration = "n/a"
    if not duration_df.empty:
        buckets = duration_df[duration_df["Duration Bucket"] != "Overall"]
        if not buckets.empty:
            best_duration = describe(buckets, "Duration Bucket", best=True)

    lowest_skill = "n/a"
    if not skill_df.empty:
        # Prefer skills with at least 10 questions; fall back to all skills.
        candidates = skill_df[skill_df["Total"] >= 10]
        if candidates.empty:
            candidates = skill_df
        lowest_skill = describe(candidates, "Skill", best=False)

    return "\n".join(
        [
            "### Summary",
            f"- Best domain by answered accuracy: `{best_domain}`",
            f"- Best duration bucket by answered accuracy: `{best_duration}`",
            f"- Lowest skill bucket by answered accuracy: `{lowest_skill}`",
        ]
    )
409
+
410
+
411
def empty_result(status: str) -> tuple[str, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """UI outputs when there is nothing to show: the status plus blank panels/tables."""
    blank_tables = (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())
    return (status, "", EMPTY_SUMMARY_MARKDOWN, *blank_tables)
413
+
414
+
415
def evaluate_submission(
    prediction_file: str | None,
) -> tuple[str, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Score an uploaded prediction file against the cached ground truth.

    Returns (status_md, metrics_md, summary_md, domain_df, duration_df,
    skill_df). Because this runs as a Gradio event handler, every failure is
    caught and reported through the status panel instead of being raised.
    """
    if not prediction_file:
        return empty_result(
            "### Upload required\nPlease upload a `.json` or `.jsonl` prediction file before evaluating."
        )

    started_at = time.time()

    try:
        ground_truth = load_ground_truth()
        records, file_format = load_records(prediction_file)
        if not records:
            raise ValueError("No valid prediction records were found in the uploaded file.")

        predictions, duplicate_prediction_ids, skipped_empty_answers = build_prediction_map(records)
        # Per-bucket counters for each breakdown axis:
        # total = all GT questions, answered = predicted, correct = matched letter.
        domain_stats: dict[str, dict[str, int]] = defaultdict(lambda: {"correct": 0, "answered": 0, "total": 0})
        duration_stats: dict[str, dict[str, int]] = defaultdict(lambda: {"correct": 0, "answered": 0, "total": 0})
        skill_stats: dict[str, dict[str, int]] = defaultdict(lambda: {"correct": 0, "answered": 0, "total": 0})

        correct = 0
        answered = 0
        gt_ids = set(ground_truth)
        pred_ids = set(predictions)

        # Iterate the ground truth (not the predictions) so official accuracy
        # is computed over every question, answered or not.
        for question_id, gt in ground_truth.items():
            # Duration bucket is in minutes; skipped when the GT has no duration.
            duration_key = duration_bucket(gt.video_duration_sec / 60.0) if gt.video_duration_sec is not None else None
            scopes = [
                (domain_stats, [gt.domain]),
                (duration_stats, [duration_key] if duration_key else []),
                (skill_stats, list(gt.skills)),
            ]

            for stats, keys in scopes:
                bump(stats, keys, "total")

            answer = predictions.get(question_id)
            if not answer:
                continue

            answered += 1
            for stats, keys in scopes:
                bump(stats, keys, "answered")

            if answer == gt.correct_letter:
                correct += 1
                for stats, keys in scopes:
                    bump(stats, keys, "correct")

        total_ground_truth = len(ground_truth)
        # Synthetic "Overall" row appended to the duration table.
        duration_stats["Overall"] = {"total": total_ground_truth, "answered": answered, "correct": correct}

        summary = {
            "correct": correct,
            "answered_predictions": answered,
            "total_ground_truth": total_ground_truth,
            "official_accuracy_pct": safe_pct(correct, total_ground_truth),
            "answered_accuracy_pct": safe_pct(correct, answered),
            "coverage_pct": safe_pct(answered, total_ground_truth),
            "matched_prediction_ids": len(pred_ids & gt_ids),
            "missing_prediction_ids": total_ground_truth - len(pred_ids & gt_ids),
            "extra_prediction_ids": len(pred_ids - gt_ids),
            "duplicate_prediction_ids": duplicate_prediction_ids,
        }

        domain_df = make_breakdown_dataframe(domain_stats, "Domain", ordered_labels=DOMAINS_ORDER)
        duration_df = make_breakdown_dataframe(
            duration_stats,
            "Duration Bucket",
            ordered_labels=DURATION_BUCKET_ORDER,
        )
        skill_df = make_breakdown_dataframe(skill_stats, "Skill")

        status_markdown = (
            "### Evaluation complete\n"
            f"- Parsed file format: `{file_format}`\n"
            f"- Uploaded rows: `{len(records)}`\n"
            f"- Skipped empty answers: `{skipped_empty_answers}`\n"
            f"- Evaluation time: `{time.time() - started_at:.2f}s`"
        )
        return (
            status_markdown,
            build_metrics_markdown(summary),
            build_summary_markdown(domain_df, duration_df, skill_df),
            domain_df,
            duration_df,
            skill_df,
        )

    except Exception as exc:
        # UI boundary: convert any failure into a readable status message.
        return empty_result(f"### Evaluation failed\n`{type(exc).__name__}: {exc}`")
507
+
508
+
509
def clear_outputs() -> tuple[None, str, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Reset the file input and every output panel back to its initial state."""
    return (
        None,
        READY_STATUS_MARKDOWN,
        "",
        EMPTY_SUMMARY_MARKDOWN,
        pd.DataFrame(),
        pd.DataFrame(),
        pd.DataFrame(),
    )
511
+
512
+
513
# Assemble the Gradio UI.
# NOTE: `theme` and `css` are options of the `gr.Blocks(...)` constructor, not
# of `Blocks.launch()` — passing them to `launch()` raises TypeError and the
# styling was never applied. They are configured here instead, and `launch()`
# below is called with no arguments.
with gr.Blocks(
    title="MMOU Evaluator",
    fill_width=False,
    theme=gr.themes.Default(),
    css=LAYOUT_CSS,
) as demo:
    gr.Markdown(APP_INTRO)

    prediction_file = gr.File(label="Upload prediction file", file_types=[".json", ".jsonl"], type="filepath")

    with gr.Row():
        evaluate_button = gr.Button("Evaluate", variant="primary")
        clear_button = gr.Button("Clear")

    # Output panels populated by evaluate_submission / reset by clear_outputs.
    status_markdown = gr.Markdown(READY_STATUS_MARKDOWN)
    metrics_markdown = gr.Markdown("")
    summary_markdown = gr.Markdown(EMPTY_SUMMARY_MARKDOWN)
    gr.Markdown(FORMAT_GUIDE)

    with gr.Tabs():
        with gr.Tab("Domain Breakdown"):
            domain_dataframe = gr.Dataframe(label="Domain breakdown", interactive=False, wrap=True)
        with gr.Tab("Duration Breakdown"):
            duration_dataframe = gr.Dataframe(label="Duration breakdown", interactive=False, wrap=True)
        with gr.Tab("Skill Breakdown"):
            skill_dataframe = gr.Dataframe(label="Skill breakdown", interactive=False, wrap=True)

    evaluate_button.click(
        fn=evaluate_submission,
        inputs=[prediction_file],
        outputs=[
            status_markdown,
            metrics_markdown,
            summary_markdown,
            domain_dataframe,
            duration_dataframe,
            skill_dataframe,
        ],
    )
    clear_button.click(
        fn=clear_outputs,
        outputs=[
            prediction_file,
            status_markdown,
            metrics_markdown,
            summary_markdown,
            domain_dataframe,
            duration_dataframe,
            skill_dataframe,
        ],
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=6.0.0
2
+ pandas>=2.2.0
3
+ huggingface_hub>=0.30.0