Spaces:

ai4data
/

reliefweb_annotation

Sleeping

App Files Files Community

rafmacalaba committed on Jan 5

Commit

4447e87

1 Parent(s): 6849766

add instructions

Browse files

Files changed (1) hide show

app.py +132 -52

app.py CHANGED Viewed

@@ -461,70 +461,150 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
         gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">🌙 Dark Mode</button>')
         gr.Markdown("# Dataset Annotation Tool")
-        gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
-        with gr.Row():
-            with gr.Column(scale=2):
-                dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
-                context_box = gr.HighlightedText(
-                    label="Context (±1 sentence, dataset highlighted)",
-                    color_map={"DATASET": "yellow"},
-                    show_legend=False,
-                    combine_adjacent=True
-                )
-                metadata_box = gr.Markdown(label="Metadata")
-                show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
-                ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
-            with gr.Column(scale=1):
-                progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
-                chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
-                dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
-                status_box = gr.Textbox(label="Status", interactive=False, lines=1)
-                notes_box = gr.Textbox(
-                    label="Notes (optional)",
-                    placeholder="Add any comments about this dataset...",
-                    lines=3
-                )
-                with gr.Row():
-                    accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
-                    reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")
-                gr.Markdown("---")
-                with gr.Row():
-                    prev_btn = gr.Button("← Previous", size="sm")
-                    next_btn = gr.Button("Next →", size="sm")
-                skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
-                gr.Markdown("---")
-                with gr.Accordion("📊 Live Statistics", open=True):
-                    stats_box = gr.Markdown()
-                gr.Markdown("---")
-                # Download button for manual backup
-                download_btn = gr.DownloadButton(
-                    "💾 Download Annotations",
-                    value=str(annotator.output_file) if annotator.output_file.exists() else None,
-                    size="sm",
-                    variant="secondary"
-                )
-                # HF Datasets status
-                if annotator.hf_enabled:
-                    gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
-                else:
-                    gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
-                gr.Markdown("---")
-                gr.Markdown(f"**Input:** `{Path(input_file).name}`")
-                gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
         nav_state = gr.State({})

         gr.HTML('<button id="theme_toggle" onclick="toggleDarkMode()">🌙 Dark Mode</button>')
         gr.Markdown("# Dataset Annotation Tool")
+        with gr.Tabs():
+            # Tab 1: Introduction and Instructions
+            with gr.Tab("📖 Introduction & Instructions"):
+                gr.Markdown("""
+                ## Welcome to the Dataset Annotation Tool
+                This tool helps validate dataset mentions extracted from UNHCR and ReliefWeb documents. Your annotations will improve the accuracy of our dataset extraction model.
+                ### What You'll Be Annotating
+                You'll review **candidate dataset mentions** that our AI model has identified in humanitarian documents. Your task is to determine whether each mention is:
+                - ✅ **A Dataset**: A collection of data that can be referenced, analyzed, or used (e.g., surveys, databases, statistical reports)
+                - ❌ **Not a Dataset**: A document title, framework, strategy, or general reference that doesn't represent actual data
+                ### About the Data
+                - **Source**: UNHCR and ReliefWeb PDF documents
+                - **Sampling**: Stratified sample across different mention types (named, descriptive, vague)
+                - **AI Models**:
+                  - **Extraction Model**: Fine-tuned model that identified these mentions
+                  - **Judge (GPT-5.2)**: LLM-based validator that reviewed the extractions
+                ### How to Annotate
+                1. **Review the Mention**: Read the **Dataset Name** and examine the **Context** (highlighted in yellow)
+                2. **Check Metadata**: Review document source, stratum, and geography information
+                3. **Compare AI Predictions** (Optional): Toggle "🤖 Show what the AI thinks" to see model predictions
+                4. **Make Your Decision**:
+                   - Click **✓ DATASET** (green) if it's a valid dataset
+                   - Click **✗ NOT A DATASET** (red) if it's not a dataset
+                5. **Add Notes** (Optional): Document your reasoning for ambiguous cases
+                6. **Navigate**: Use Previous/Next buttons or skip to unannotated samples
+                7. **Save Progress**:
+                   - Click **💾 Download Annotations** to backup locally
+                   - Auto-backup to HF Datasets (if configured)
+                ### What Makes Something a Dataset?
+                ✅ **IS a Dataset:**
+                - Survey data (e.g., "UNHCR Household Survey 2023")
+                - Statistical databases (e.g., "Population Statistics Database")
+                - Assessment results with data (e.g., "Needs Assessment 2024" when cited as data source)
+                - Index datasets (e.g., "Multidimensional Poverty Index")
+                - Monitoring data (e.g., "Protection Monitoring Data")
+                ❌ **NOT a Dataset:**
+                - Report titles (e.g., "Global Trends Report 2024" as a publication)
+                - Frameworks/strategies (e.g., "Global Compact on Refugees")
+                - Assessment activities (e.g., "Rapid Assessment" as the activity itself)
+                - General document references
+                ### Tips for Accuracy
+                - **Context is key**: The same term can be a dataset or not depending on usage
+                - **Look for data indicators**: Numbers, statistics, "based on", "source:", "data from"
+                - **When in doubt**: Add a note explaining your reasoning
+                - **Be consistent**: Use the same criteria throughout your annotation session
+                ### Your Impact
+                Your annotations will:
+                - Improve model precision and recall
+                - Help identify patterns in false positives/negatives
+                - Create training data for the next model version
+                - Support better dataset discovery in humanitarian documents
+                ---
+                **Ready to start?** Click the **"Annotate"** tab above to begin!
+                """)
+            # Tab 2: Annotation Interface
+            with gr.Tab("✏️ Annotate"):
+                gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
+                        context_box = gr.HighlightedText(
+                            label="Context (±1 sentence, dataset highlighted)",
+                            color_map={"DATASET": "yellow"},
+                            show_legend=False,
+                            combine_adjacent=True
+                        )
+                        metadata_box = gr.Markdown(label="Metadata")
+                        show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
+                        ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
+                    with gr.Column(scale=1):
+                        progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
+                        chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
+                        dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
+                        status_box = gr.Textbox(label="Status", interactive=False, lines=1)
+                        notes_box = gr.Textbox(
+                            label="Notes (optional)",
+                            placeholder="Add any comments about this dataset...",
+                            lines=3
+                        )
+                        with gr.Row():
+                            accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
+                            reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")
+                        gr.Markdown("---")
+                        with gr.Row():
+                            prev_btn = gr.Button("← Previous", size="sm")
+                            next_btn = gr.Button("Next →", size="sm")
+                        skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
+                        gr.Markdown("---")
+                        with gr.Accordion("📊 Live Statistics", open=True):
+                            stats_box = gr.Markdown()
+                        gr.Markdown("---")
+                        # Download button for manual backup
+                        download_btn = gr.DownloadButton(
+                            "💾 Download Annotations",
+                            value=str(annotator.output_file) if annotator.output_file.exists() else None,
+                            size="sm",
+                            variant="secondary"
+                        )
+                        # HF Datasets status
+                        if annotator.hf_enabled:
+                            gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
+                        else:
+                            gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
+                        gr.Markdown("---")
+                        gr.Markdown(f"**Input:** `{Path(input_file).name}`")
+                        gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
         nav_state = gr.State({})