rafmacalaba committed on
Commit
4447e87
·
1 Parent(s): 6849766

add instructions

Browse files
Files changed (1) hide show
  1. app.py +132 -52
app.py CHANGED
@@ -461,70 +461,150 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
461
  gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">🌙 Dark Mode</button>')
462
 
463
  gr.Markdown("# Dataset Annotation Tool")
464
- gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
465
-
466
- with gr.Row():
467
- with gr.Column(scale=2):
468
- dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
469
- context_box = gr.HighlightedText(
470
- label="Context (±1 sentence, dataset highlighted)",
471
- color_map={"DATASET": "yellow"},
472
- show_legend=False,
473
- combine_adjacent=True
474
- )
475
- metadata_box = gr.Markdown(label="Metadata")
476
 
477
- show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
478
- ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
479
-
480
- with gr.Column(scale=1):
481
- progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
482
- chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
483
- dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
484
- status_box = gr.Textbox(label="Status", interactive=False, lines=1)
485
 
486
- notes_box = gr.Textbox(
487
- label="Notes (optional)",
488
- placeholder="Add any comments about this dataset...",
489
- lines=3
490
- )
491
 
492
- with gr.Row():
493
- accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
494
- reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")
495
 
496
- gr.Markdown("---")
497
 
498
- with gr.Row():
499
- prev_btn = gr.Button("← Previous", size="sm")
500
- next_btn = gr.Button("Next →", size="sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
- skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
503
 
504
- gr.Markdown("---")
505
 
506
- with gr.Accordion("📊 Live Statistics", open=True):
507
- stats_box = gr.Markdown()
 
508
 
509
- gr.Markdown("---")
510
 
511
- # Download button for manual backup
512
- download_btn = gr.DownloadButton(
513
- "💾 Download Annotations",
514
- value=str(annotator.output_file) if annotator.output_file.exists() else None,
515
- size="sm",
516
- variant="secondary"
517
- )
518
 
519
- # HF Datasets status
520
- if annotator.hf_enabled:
521
- gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
522
- else:
523
- gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
524
 
525
- gr.Markdown("---")
526
- gr.Markdown(f"**Input:** `{Path(input_file).name}`")
527
- gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
  nav_state = gr.State({})
530
 
 
461
  gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">🌙 Dark Mode</button>')
462
 
463
  gr.Markdown("# Dataset Annotation Tool")
464
+
465
+ with gr.Tabs():
466
+ # Tab 1: Introduction and Instructions
467
+ with gr.Tab("📖 Introduction & Instructions"):
468
+ gr.Markdown("""
469
+ ## Welcome to the Dataset Annotation Tool
 
 
 
 
 
 
470
 
471
+ This tool helps validate dataset mentions extracted from UNHCR and ReliefWeb documents. Your annotations will improve the accuracy of our dataset extraction model.
 
 
 
 
 
 
 
472
 
473
+ ### What You'll Be Annotating
 
 
 
 
474
 
475
+ You'll review **candidate dataset mentions** that our AI model has identified in humanitarian documents. Your task is to determine whether each mention is:
476
+ - ✅ **A Dataset**: A collection of data that can be referenced, analyzed, or used (e.g., surveys, databases, statistical reports)
477
+ - ❌ **Not a Dataset**: A document title, framework, strategy, or general reference that doesn't represent actual data
478
 
479
+ ### About the Data
480
 
481
+ - **Source**: UNHCR and ReliefWeb PDF documents
482
+ - **Sampling**: Stratified sample across different mention types (named, descriptive, vague)
483
+ - **AI Models**:
484
+ - **Extraction Model**: Fine-tuned model that identified these mentions
485
+ - **Judge (GPT-5.2)**: LLM-based validator that reviewed the extractions
486
+
487
+ ### How to Annotate
488
+
489
+ 1. **Review the Mention**: Read the **Dataset Name** and examine the **Context** (highlighted in yellow)
490
+
491
+ 2. **Check Metadata**: Review document source, stratum, and geography information
492
+
493
+ 3. **Compare AI Predictions** (Optional): Toggle "🤖 Show what the AI thinks" to see model predictions
494
+
495
+ 4. **Make Your Decision**:
496
+ - Click **✓ DATASET** (green) if it's a valid dataset
497
+ - Click **✗ NOT A DATASET** (red) if it's not a dataset
498
 
499
+ 5. **Add Notes** (Optional): Document your reasoning for ambiguous cases
500
 
501
+ 6. **Navigate**: Use Previous/Next buttons or skip to unannotated samples
502
 
503
+ 7. **Save Progress**:
504
+ - Click **💾 Download Annotations** to backup locally
505
+ - Auto-backup to HF Datasets (if configured)
506
 
507
+ ### What Makes Something a Dataset?
508
 
509
+ ✅ **IS a Dataset:**
510
+ - Survey data (e.g., "UNHCR Household Survey 2023")
511
+ - Statistical databases (e.g., "Population Statistics Database")
512
+ - Assessment results with data (e.g., "Needs Assessment 2024" when cited as data source)
513
+ - Index datasets (e.g., "Multidimensional Poverty Index")
514
+ - Monitoring data (e.g., "Protection Monitoring Data")
 
515
 
516
+ ❌ **NOT a Dataset:**
517
+ - Report titles (e.g., "Global Trends Report 2024" as a publication)
518
+ - Frameworks/strategies (e.g., "Global Compact on Refugees")
519
+ - Assessment activities (e.g., "Rapid Assessment" as the activity itself)
520
+ - General document references
521
 
522
+ ### Tips for Accuracy
523
+
524
+ - **Context is key**: The same term can be a dataset or not depending on usage
525
+ - **Look for data indicators**: Numbers, statistics, "based on", "source:", "data from"
526
+ - **When in doubt**: Add a note explaining your reasoning
527
+ - **Be consistent**: Use the same criteria throughout your annotation session
528
+
529
+ ### Your Impact
530
+
531
+ Your annotations will:
532
+ - Improve model precision and recall
533
+ - Help identify patterns in false positives/negatives
534
+ - Create training data for the next model version
535
+ - Support better dataset discovery in humanitarian documents
536
+
537
+ ---
538
+
539
+ **Ready to start?** Click the **"Annotate"** tab above to begin!
540
+ """)
541
+
542
+ # Tab 2: Annotation Interface
543
+ with gr.Tab("✏️ Annotate"):
544
+ gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
545
+
546
+ with gr.Row():
547
+ with gr.Column(scale=2):
548
+ dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
549
+ context_box = gr.HighlightedText(
550
+ label="Context (±1 sentence, dataset highlighted)",
551
+ color_map={"DATASET": "yellow"},
552
+ show_legend=False,
553
+ combine_adjacent=True
554
+ )
555
+ metadata_box = gr.Markdown(label="Metadata")
556
+
557
+ show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
558
+ ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
559
+
560
+ with gr.Column(scale=1):
561
+ progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
562
+ chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
563
+ dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
564
+ status_box = gr.Textbox(label="Status", interactive=False, lines=1)
565
+
566
+ notes_box = gr.Textbox(
567
+ label="Notes (optional)",
568
+ placeholder="Add any comments about this dataset...",
569
+ lines=3
570
+ )
571
+
572
+ with gr.Row():
573
+ accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
574
+ reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")
575
+
576
+ gr.Markdown("---")
577
+
578
+ with gr.Row():
579
+ prev_btn = gr.Button("← Previous", size="sm")
580
+ next_btn = gr.Button("Next →", size="sm")
581
+
582
+ skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
583
+
584
+ gr.Markdown("---")
585
+
586
+ with gr.Accordion("📊 Live Statistics", open=True):
587
+ stats_box = gr.Markdown()
588
+
589
+ gr.Markdown("---")
590
+
591
+ # Download button for manual backup
592
+ download_btn = gr.DownloadButton(
593
+ "💾 Download Annotations",
594
+ value=str(annotator.output_file) if annotator.output_file.exists() else None,
595
+ size="sm",
596
+ variant="secondary"
597
+ )
598
+
599
+ # HF Datasets status
600
+ if annotator.hf_enabled:
601
+ gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
602
+ else:
603
+ gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
604
+
605
+ gr.Markdown("---")
606
+ gr.Markdown(f"**Input:** `{Path(input_file).name}`")
607
+ gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
608
 
609
  nav_state = gr.State({})
610