Spaces:
Sleeping
Sleeping
Commit
·
4447e87
1
Parent(s):
6849766
add instructions
Browse files
app.py
CHANGED
|
@@ -461,70 +461,150 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
|
|
| 461 |
gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">π Dark Mode</button>')
|
| 462 |
|
| 463 |
gr.Markdown("# Dataset Annotation Tool")
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
with gr.
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
label="Context (±1 sentence, dataset highlighted)",
|
| 471 |
-
color_map={"DATASET": "yellow"},
|
| 472 |
-
show_legend=False,
|
| 473 |
-
combine_adjacent=True
|
| 474 |
-
)
|
| 475 |
-
metadata_box = gr.Markdown(label="Metadata")
|
| 476 |
|
| 477 |
-
|
| 478 |
-
ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
|
| 479 |
-
|
| 480 |
-
with gr.Column(scale=1):
|
| 481 |
-
progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
|
| 482 |
-
chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
|
| 483 |
-
dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
|
| 484 |
-
status_box = gr.Textbox(label="Status", interactive=False, lines=1)
|
| 485 |
|
| 486 |
-
|
| 487 |
-
label="Notes (optional)",
|
| 488 |
-
placeholder="Add any comments about this dataset...",
|
| 489 |
-
lines=3
|
| 490 |
-
)
|
| 491 |
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
|
| 496 |
-
|
| 497 |
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
-
|
| 503 |
|
| 504 |
-
|
| 505 |
|
| 506 |
-
|
| 507 |
-
|
|
|
|
| 508 |
|
| 509 |
-
|
| 510 |
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
)
|
| 518 |
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
|
| 529 |
nav_state = gr.State({})
|
| 530 |
|
|
|
|
| 461 |
gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">π Dark Mode</button>')
|
| 462 |
|
| 463 |
gr.Markdown("# Dataset Annotation Tool")
|
| 464 |
+
|
| 465 |
+
with gr.Tabs():
|
| 466 |
+
# Tab 1: Introduction and Instructions
|
| 467 |
+
with gr.Tab("π Introduction & Instructions"):
|
| 468 |
+
gr.Markdown("""
|
| 469 |
+
## Welcome to the Dataset Annotation Tool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
+
This tool helps validate dataset mentions extracted from UNHCR and ReliefWeb documents. Your annotations will improve the accuracy of our dataset extraction model.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
+
### What You'll Be Annotating
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
+
You'll review **candidate dataset mentions** that our AI model has identified in humanitarian documents. Your task is to determine whether each mention is:
|
| 476 |
+
- ✅ **A Dataset**: A collection of data that can be referenced, analyzed, or used (e.g., surveys, databases, statistical reports)
|
| 477 |
+
- ❌ **Not a Dataset**: A document title, framework, strategy, or general reference that doesn't represent actual data
|
| 478 |
|
| 479 |
+
### About the Data
|
| 480 |
|
| 481 |
+
- **Source**: UNHCR and ReliefWeb PDF documents
|
| 482 |
+
- **Sampling**: Stratified sample across different mention types (named, descriptive, vague)
|
| 483 |
+
- **AI Models**:
|
| 484 |
+
- **Extraction Model**: Fine-tuned model that identified these mentions
|
| 485 |
+
- **Judge (GPT-5.2)**: LLM-based validator that reviewed the extractions
|
| 486 |
+
|
| 487 |
+
### How to Annotate
|
| 488 |
+
|
| 489 |
+
1. **Review the Mention**: Read the **Dataset Name** and examine the **Context** (highlighted in yellow)
|
| 490 |
+
|
| 491 |
+
2. **Check Metadata**: Review document source, stratum, and geography information
|
| 492 |
+
|
| 493 |
+
3. **Compare AI Predictions** (Optional): Toggle "🤖 Show what the AI thinks" to see model predictions
|
| 494 |
+
|
| 495 |
+
4. **Make Your Decision**:
|
| 496 |
+
- Click **✓ DATASET** (green) if it's a valid dataset
|
| 497 |
+
- Click **✗ NOT A DATASET** (red) if it's not a dataset
|
| 498 |
|
| 499 |
+
5. **Add Notes** (Optional): Document your reasoning for ambiguous cases
|
| 500 |
|
| 501 |
+
6. **Navigate**: Use Previous/Next buttons or skip to unannotated samples
|
| 502 |
|
| 503 |
+
7. **Save Progress**:
|
| 504 |
+
- Click **💾 Download Annotations** to backup locally
|
| 505 |
+
- Auto-backup to HF Datasets (if configured)
|
| 506 |
|
| 507 |
+
### What Makes Something a Dataset?
|
| 508 |
|
| 509 |
+
✅ **IS a Dataset:**
|
| 510 |
+
- Survey data (e.g., "UNHCR Household Survey 2023")
|
| 511 |
+
- Statistical databases (e.g., "Population Statistics Database")
|
| 512 |
+
- Assessment results with data (e.g., "Needs Assessment 2024" when cited as data source)
|
| 513 |
+
- Index datasets (e.g., "Multidimensional Poverty Index")
|
| 514 |
+
- Monitoring data (e.g., "Protection Monitoring Data")
|
|
|
|
| 515 |
|
| 516 |
+
❌ **NOT a Dataset:**
|
| 517 |
+
- Report titles (e.g., "Global Trends Report 2024" as a publication)
|
| 518 |
+
- Frameworks/strategies (e.g., "Global Compact on Refugees")
|
| 519 |
+
- Assessment activities (e.g., "Rapid Assessment" as the activity itself)
|
| 520 |
+
- General document references
|
| 521 |
|
| 522 |
+
### Tips for Accuracy
|
| 523 |
+
|
| 524 |
+
- **Context is key**: The same term can be a dataset or not depending on usage
|
| 525 |
+
- **Look for data indicators**: Numbers, statistics, "based on", "source:", "data from"
|
| 526 |
+
- **When in doubt**: Add a note explaining your reasoning
|
| 527 |
+
- **Be consistent**: Use the same criteria throughout your annotation session
|
| 528 |
+
|
| 529 |
+
### Your Impact
|
| 530 |
+
|
| 531 |
+
Your annotations will:
|
| 532 |
+
- Improve model precision and recall
|
| 533 |
+
- Help identify patterns in false positives/negatives
|
| 534 |
+
- Create training data for the next model version
|
| 535 |
+
- Support better dataset discovery in humanitarian documents
|
| 536 |
+
|
| 537 |
+
---
|
| 538 |
+
|
| 539 |
+
**Ready to start?** Click the **"Annotate"** tab above to begin!
|
| 540 |
+
""")
|
| 541 |
+
|
| 542 |
+
# Tab 2: Annotation Interface
|
| 543 |
+
with gr.Tab("βοΈ Annotate"):
|
| 544 |
+
gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
|
| 545 |
+
|
| 546 |
+
with gr.Row():
|
| 547 |
+
with gr.Column(scale=2):
|
| 548 |
+
dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
|
| 549 |
+
context_box = gr.HighlightedText(
|
| 550 |
+
label="Context (±1 sentence, dataset highlighted)",
|
| 551 |
+
color_map={"DATASET": "yellow"},
|
| 552 |
+
show_legend=False,
|
| 553 |
+
combine_adjacent=True
|
| 554 |
+
)
|
| 555 |
+
metadata_box = gr.Markdown(label="Metadata")
|
| 556 |
+
|
| 557 |
+
show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
|
| 558 |
+
ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
|
| 559 |
+
|
| 560 |
+
with gr.Column(scale=1):
|
| 561 |
+
progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
|
| 562 |
+
chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
|
| 563 |
+
dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
|
| 564 |
+
status_box = gr.Textbox(label="Status", interactive=False, lines=1)
|
| 565 |
+
|
| 566 |
+
notes_box = gr.Textbox(
|
| 567 |
+
label="Notes (optional)",
|
| 568 |
+
placeholder="Add any comments about this dataset...",
|
| 569 |
+
lines=3
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
with gr.Row():
|
| 573 |
+
accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
|
| 574 |
+
reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")
|
| 575 |
+
|
| 576 |
+
gr.Markdown("---")
|
| 577 |
+
|
| 578 |
+
with gr.Row():
|
| 579 |
+
prev_btn = gr.Button("← Previous", size="sm")
|
| 580 |
+
next_btn = gr.Button("Next →", size="sm")
|
| 581 |
+
|
| 582 |
+
skip_btn = gr.Button("βοΈ Skip to Next Unannotated", size="sm")
|
| 583 |
+
|
| 584 |
+
gr.Markdown("---")
|
| 585 |
+
|
| 586 |
+
with gr.Accordion("π Live Statistics", open=True):
|
| 587 |
+
stats_box = gr.Markdown()
|
| 588 |
+
|
| 589 |
+
gr.Markdown("---")
|
| 590 |
+
|
| 591 |
+
# Download button for manual backup
|
| 592 |
+
download_btn = gr.DownloadButton(
|
| 593 |
+
"💾 Download Annotations",
|
| 594 |
+
value=str(annotator.output_file) if annotator.output_file.exists() else None,
|
| 595 |
+
size="sm",
|
| 596 |
+
variant="secondary"
|
| 597 |
+
)
|
| 598 |
+
|
| 599 |
+
# HF Datasets status
|
| 600 |
+
if annotator.hf_enabled:
|
| 601 |
+
gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
|
| 602 |
+
else:
|
| 603 |
+
gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
|
| 604 |
+
|
| 605 |
+
gr.Markdown("---")
|
| 606 |
+
gr.Markdown(f"**Input:** `{Path(input_file).name}`")
|
| 607 |
+
gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
|
| 608 |
|
| 609 |
nav_state = gr.State({})
|
| 610 |
|