Spaces:

ai4data
/

reliefweb_annotation

Running

App Files Files Community

rafmacalaba committed 5 days ago

Commit

64b4b2f

1 Parent(s): b040870

add pdf viewer

Browse files

Files changed (2) hide show

app.py +252 -64
upload_pdfs.py +133 -0

app.py CHANGED Viewed

@@ -16,7 +16,17 @@ import gradio as gr
 import json
 import re
 import os
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional
 from datetime import datetime
 from huggingface_hub import HfApi, login
@@ -31,13 +41,20 @@ class ValidationAnnotator:
     No 4o data available - only judge (GPT-5.2) verdicts are shown.
     """
-    def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None):
         self.input_file = Path(input_file)
         self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
         # HF Datasets integration
         self.hf_dataset_repo = hf_dataset_repo
         self.hf_token = hf_token or os.getenv("HF_TOKEN")
         self.hf_enabled = False
         # Try to enable HF Datasets if credentials provided
@@ -296,19 +313,16 @@ class ValidationAnnotator:
             # Show all records including siblings
             self.filtered_indices = list(range(len(self.records)))
         else:
-            # Filter by extraction_tag OR judge_tag matching the filter
             # AND exclude siblings (only show primary samples)
             self.filtered_indices = [
                 i for i, record in enumerate(self.records)
-                if (record.get('extraction_tag') == filter_value or record.get('judge_tag') == filter_value)
                 and record.get('is_primary', True)  # Only primary samples, not siblings
             ]
-        # Reset to first filtered record if current position is not in filtered set
-        if self.current_idx not in self.filtered_indices and self.filtered_indices:
-            self.current_idx = self.filtered_indices[0]
-        elif not self.filtered_indices:
-            self.current_idx = len(self.records)  # No matching records
     def _is_annotated(self, idx: int) -> bool:
         """Check if a record has been annotated."""
@@ -332,18 +346,28 @@ class ValidationAnnotator:
         return False
     def _find_next_unannotated(self):
-        """Find the next unannotated record (skipping one-word vague/descriptive)."""
-        for i in range(len(self.records)):
-            if not self._is_annotated(i) and not self._should_skip(i):
-                self.current_idx = i
                 return
-        # All annotated or skippable
-        self.current_idx = len(self.records)
-    def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict]:
         """Get current record for display."""
         if self.current_idx >= len(self.records):
-            return "🎉 All samples validated!", [], "", "", f"Progress: {len(self.annotations)}/{len(self.records)} (100%)", "✅ Complete", {}
         record = self.records[self.current_idx]
@@ -379,20 +403,26 @@ class ValidationAnnotator:
         if record.get('judge_data_type'):
             ai_verdicts_str += f"**Data Type:** {record['judge_data_type']}\n"
         if record.get('judge_reasoning'):
-            reasoning = record['judge_reasoning'][:300]
             ai_verdicts_str += f"\n*Reasoning:* {reasoning}..."
         # Metadata
         metadata_parts = []
-        metadata_parts.append(f"**Stratum:** `{record['stratum']}`")
-        metadata_parts.append(f"**Document:** `{record['document'][:50]}...`")
         is_primary = record.get('is_primary', True)
-        metadata_parts.append(f"**Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}")
         if record.get('geography'):
             geo = record['geography']
             if isinstance(geo, dict):
                 geo = geo.get('text', str(geo))
-            metadata_parts.append(f"**Geography:** {geo}")
         metadata_str = "\n".join(metadata_parts)
         # Get chunk info
@@ -419,7 +449,38 @@ class ValidationAnnotator:
             'can_next': self.current_idx < self.total_datasets - 1
         }
-        return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav
     def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]:
         """Annotate current record and move to next."""
@@ -431,14 +492,30 @@ class ValidationAnnotator:
         return self.get_current_display()[:6]
     def next_record(self):
-        """Move to next record."""
-        if self.current_idx < len(self.records) - 1:
-            self.current_idx += 1
     def prev_record(self):
-        """Move to previous record."""
-        if self.current_idx > 0:
-            self.current_idx -= 1
     def skip_to_next_unannotated(self):
         """Skip to next unannotated record (also skipping one-word vague/descriptive)."""
@@ -471,9 +548,10 @@ class ValidationAnnotator:
         return stats
-def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None):
     """Create and configure Gradio app."""
-    annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token)
     # Custom CSS for the green button and dark mode toggle
     css = """
@@ -520,6 +598,20 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
             const btn = document.getElementById('theme_toggle');
             if (btn) btn.textContent = '☀️ Light Mode';
         }
     });
     """
@@ -606,25 +698,35 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
                 **Ready to start?** Click the **"Annotate"** tab above to begin!
                 """)
             # Tab 2: Annotation Interface
             with gr.Tab("✏️ Annotate") as annotate_tab:
-                gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
                 with gr.Row():
-                    with gr.Column(scale=2):
-                        dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
                         context_box = gr.HighlightedText(
                             label="Context (±2 sentences, dataset highlighted)",
                             color_map={"DATASET": "yellow"},
                             show_legend=False,
                             combine_adjacent=True
                         )
-                        metadata_box = gr.Markdown(label="Metadata")
                         show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
-                        ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
-                    with gr.Column(scale=1):
                         # Filter dropdown
                         filter_dropdown = gr.Dropdown(
                             choices=["All", "named", "descriptive", "vague", "non-dataset"],
@@ -633,10 +735,10 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
                             interactive=True
                         )
-                        progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
-                        chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
-                        dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
-                        status_box = gr.Textbox(label="Status", interactive=False, lines=1)
                         notes_box = gr.Textbox(
                             label="Notes (optional)",
@@ -656,12 +758,8 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
                         skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
-                        gr.Markdown("---")
-                        with gr.Accordion("📊 Live Statistics", open=True):
-                            stats_box = gr.Markdown()
-                        gr.Markdown("---")
                         # Download button for manual backup
                         download_btn = gr.DownloadButton(
@@ -675,36 +773,72 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
                         if annotator.hf_enabled:
                             gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
                         else:
-                            gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
-                        gr.Markdown("---")
                         gr.Markdown(f"**Input:** `{Path(input_file).name}`")
-                        gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
         nav_state = gr.State({})
         def update_display():
-            name, context, metadata, ai_verdicts, progress, status, nav = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
-            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats
         def accept_and_next(notes):
             name, context, metadata, ai_verdicts, progress, status = annotator.annotate('dataset', notes)
-            _, _, _, _, _, _, nav = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
-            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats
         def reject_and_next(notes):
             name, context, metadata, ai_verdicts, progress, status = annotator.annotate('non-dataset', notes)
-            _, _, _, _, _, _, nav = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
-            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats
         def go_next():
             annotator.next_record()
@@ -721,7 +855,8 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
         def toggle_ai_verdicts(show_ai):
             if show_ai:
                 # Get current AI verdicts content
-                _, _, _, ai_verdicts, _, _, _ = annotator.get_current_display()
                 return gr.update(visible=True, value=ai_verdicts)
             return gr.update(visible=False)
@@ -732,8 +867,11 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
             return None
         # Outputs - updated with chunk_info and dataset_in_chunk
-        outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box]
-        outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state, stats_box]
         accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
             get_download_file, outputs=[download_btn]
@@ -752,19 +890,44 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
         filter_dropdown.change(apply_filter, inputs=[filter_dropdown], outputs=outputs_list)
         show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box])
-        # Load data when app starts AND when tab is selected
-        app.load(update_display, outputs=outputs_list)
         annotate_tab.select(update_display, outputs=outputs_list)
     return app
 # For Hugging Face Spaces deployment
 if __name__ == "__main__":
-    # Use the data file in the repository
-    input_file = "validation_sample_filtering_retained.jsonl"
     # Check if file exists
     if not Path(input_file).exists():
         raise FileNotFoundError(
             f"Input file '{input_file}' not found. "
@@ -775,6 +938,31 @@ if __name__ == "__main__":
     hf_dataset_repo = os.getenv("HF_DATASET_REPO")  # e.g., "username/reliefweb-annotations"
     hf_token = os.getenv("HF_TOKEN")  # HF write token
     # Create and launch the app
-    app = create_app(input_file, hf_dataset_repo, hf_token)
-    app.launch()

 import json
 import re
 import os
+import argparse
 from pathlib import Path
+from dotenv import load_dotenv
+# Load .env for local development
+load_dotenv()
+try:
+    from gradio_pdf import PDF as gr_pdf
+except ImportError:
+    gr_pdf = None
 from typing import Dict, List, Tuple, Optional
 from datetime import datetime
 from huggingface_hub import HfApi, login
     No 4o data available - only judge (GPT-5.2) verdicts are shown.
     """
+    def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
+                 pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
         self.input_file = Path(input_file)
         self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
         # HF Datasets integration
         self.hf_dataset_repo = hf_dataset_repo
         self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        # PDF configuration
+        self.pdf_dir = Path(pdf_dir) if pdf_dir else None
+        self.pdf_url_base = pdf_url_base
+        if self.pdf_dir and not self.pdf_dir.exists():
+            print(f"⚠️ PDF directory not found: {self.pdf_dir}")
         self.hf_enabled = False
         # Try to enable HF Datasets if credentials provided
             # Show all records including siblings
             self.filtered_indices = list(range(len(self.records)))
         else:
+            # Filter by extraction_tag only (not judge_tag)
             # AND exclude siblings (only show primary samples)
             self.filtered_indices = [
                 i for i, record in enumerate(self.records)
+                if record.get('extraction_tag') == filter_value
                 and record.get('is_primary', True)  # Only primary samples, not siblings
             ]
+        # Always jump to first unannotated record in the new filtered set for determinism
+        self._find_next_unannotated()
     def _is_annotated(self, idx: int) -> bool:
         """Check if a record has been annotated."""
         return False
     def _find_next_unannotated(self):
         """Point ``current_idx`` at the first pending record of the active filter.

         Walks ``self.filtered_indices`` in order and selects the first index
         that is neither annotated (``_is_annotated``) nor skippable
         (``_should_skip``).

         Fallbacks:
           * empty filtered set -> ``len(self.records)``, the out-of-range
             sentinel that ``get_current_display`` treats as "all validated";
           * every filtered record already annotated or skippable -> the
             first filtered record, so the set can still be reviewed.
         """
         if not self.filtered_indices:
             self.current_idx = len(self.records)
             return
         for idx in self.filtered_indices:
             if not self._is_annotated(idx) and not self._should_skip(idx):
                 self.current_idx = idx
                 return
         # Nothing is pending. The guard above guarantees the list is non-empty,
         # so fall back to the first filtered record.
         self.current_idx = self.filtered_indices[0]
+    def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict, str]:
         """Get current record for display."""
         if self.current_idx >= len(self.records):
+            return "🎉 All samples validated!", [], "", "", f"Progress: {len(self.annotations)}/{len(self.records)} (100%)", "✅ Complete", {}, ""
         record = self.records[self.current_idx]
         if record.get('judge_data_type'):
             ai_verdicts_str += f"**Data Type:** {record['judge_data_type']}\n"
         if record.get('judge_reasoning'):
+            reasoning = record['judge_reasoning']
             ai_verdicts_str += f"\n*Reasoning:* {reasoning}..."
+        # Metadata
         # Metadata
         metadata_parts = []
+        metadata_parts.append(f"- **Stratum:** `{record['stratum']}`")
+        # metadata_parts.append(f"- **Document:** `{record['document']}...`")
+        if record.get("source_document"):
+            metadata_parts.append(f"- **Source File:** `{record.get('source_document')}`")
+        if record.get("page_number"):
+            metadata_parts.append(f"- **Page(s):** {record.get('page_number')}")
         is_primary = record.get('is_primary', True)
+        metadata_parts.append(f"- **Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}")
         if record.get('geography'):
             geo = record['geography']
             if isinstance(geo, dict):
                 geo = geo.get('text', str(geo))
+            metadata_parts.append(f"- **Geography:** {geo}")
         metadata_str = "\n".join(metadata_parts)
         # Get chunk info
             'can_next': self.current_idx < self.total_datasets - 1
         }
+        # PDF Source path and page
+        source_doc = record.get("source_document")
+        page_num = record.get("page_number")
+        pdf_value = None
+        # Convert page_num to int and add 1 (offset from 0-indexed data)
+        try:
+            if page_num:
+                page_num = int(page_num) + 1
+            else:
+                page_num = 1
+        except (ValueError, TypeError):
+            page_num = 1
+        if source_doc and self.pdf_dir:
+            # Local PDF directory
+            pdf_path = self.pdf_dir / source_doc
+            if pdf_path.exists():
+                pdf_value = str(pdf_path.absolute())
+                print(f"📄 Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
+            else:
+                print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
+        elif source_doc and self.pdf_url_base:
+            # Remote PDF via URL (e.g., HF Datasets)
+            # Remove any leading slashes from source_doc
+            source_doc_clean = source_doc.lstrip('/')
+            pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
+            print(f"🌐 Using remote PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
+        elif source_doc:
+            print(f"ℹ️ PDF source specified ({source_doc}) but no pdf_dir or pdf_url_base provided.", flush=True)
+        return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav, pdf_value, page_num
     def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]:
         """Annotate current record and move to next."""
         return self.get_current_display()[:6]
     def next_record(self):
         """Move to the next record in the filtered set.

         Does nothing when the filtered set is empty or when the current
         record is already the last filtered one. If ``current_idx`` is not a
         member of ``filtered_indices`` (for example after the filter changed),
         the position is reset to the first filtered record.
         """
         if not self.filtered_indices:
             return
         try:
             # Only the lookup can raise ValueError; keep the try body minimal.
             current_pos = self.filtered_indices.index(self.current_idx)
         except ValueError:
             # Current idx not in filtered set (maybe filter changed), jump to first
             self.current_idx = self.filtered_indices[0]
         else:
             if current_pos < len(self.filtered_indices) - 1:
                 self.current_idx = self.filtered_indices[current_pos + 1]
     def prev_record(self):
         """Move to the previous record in the filtered set.

         Does nothing when the filtered set is empty or when the current
         record is already the first filtered one. If ``current_idx`` is not a
         member of ``filtered_indices``, the position is reset to the first
         filtered record.
         """
         if not self.filtered_indices:
             return
         try:
             # Only the lookup can raise ValueError; keep the try body minimal.
             current_pos = self.filtered_indices.index(self.current_idx)
         except ValueError:
             # Current idx not in filtered set, jump to first
             self.current_idx = self.filtered_indices[0]
         else:
             if current_pos > 0:
                 self.current_idx = self.filtered_indices[current_pos - 1]
     def skip_to_next_unannotated(self):
         """Skip to next unannotated record (also skipping one-word vague/descriptive)."""
         return stats
+def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
+               pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
     """Create and configure Gradio app."""
+    annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
     # Custom CSS for the green button and dark mode toggle
     css = """
             const btn = document.getElementById('theme_toggle');
             if (btn) btn.textContent = '☀️ Light Mode';
         }
+        // Force resize when switching to Annotate tab to help PDF viewer
+        document.body.addEventListener('click', function(e) {
+            if (e.target && e.target.innerText && e.target.innerText.includes('Annotate')) {
+                console.log('Annotate tab clicked - forcing resize');
+                setTimeout(() => {
+                    window.dispatchEvent(new Event('resize'));
+                    // Also try to find any canvases and nudge them
+                    document.querySelectorAll('canvas').forEach(c => {
+                        c.dispatchEvent(new Event('resize'));
+                    });
+                }, 500);
+            }
+        }, true);
     });
     """
                 **Ready to start?** Click the **"Annotate"** tab above to begin!
                 """)
+            # Get initial values for robust first render
+            init_name, init_context, init_metadata, init_ai, init_progress, init_status, init_nav, init_pdf, init_page = annotator.get_current_display()
+            init_chunk_info = init_nav.get('chunk_info', '')
+            init_dataset_in_chunk = init_nav.get('dataset_in_chunk', '')
+            init_stats = annotator.get_statistics()
             # Tab 2: Annotation Interface
             with gr.Tab("✏️ Annotate") as annotate_tab:
+                gr.Markdown("Review and annotate dataset mentions. PDF viewer is below for reference.")
+                # Top Section: Annotation Controls
                 with gr.Row():
+                    # Dataset Info & Context
+                    with gr.Column(scale=3):
+                        dataset_name = gr.Textbox(label="Dataset Name", value=init_name, interactive=False, max_lines=2)
                         context_box = gr.HighlightedText(
                             label="Context (±2 sentences, dataset highlighted)",
+                            value=init_context,
                             color_map={"DATASET": "yellow"},
                             show_legend=False,
                             combine_adjacent=True
                         )
+                        metadata_box = gr.Markdown(init_metadata, label="Metadata")
                         show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
+                        ai_verdicts_box = gr.Markdown(init_ai, label="AI Analysis", visible=False)
+                    # Controls & Progress
+                    with gr.Column(scale=2):
                         # Filter dropdown
                         filter_dropdown = gr.Dropdown(
                             choices=["All", "named", "descriptive", "vague", "non-dataset"],
                             interactive=True
                         )
+                        progress_box = gr.Textbox(label="Progress", value=init_progress, interactive=False, lines=1)
+                        chunk_info_box = gr.Textbox(label="Input Text Position", value=init_chunk_info, interactive=False, lines=1)
+                        dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", value=init_dataset_in_chunk, interactive=False, lines=1)
+                        status_box = gr.Textbox(label="Status", value=init_status, interactive=False, lines=1)
                         notes_box = gr.Textbox(
                             label="Notes (optional)",
                         skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
+                        with gr.Accordion("📊 Live Statistics", open=False):
+                            stats_box = gr.Markdown(init_stats)
                         # Download button for manual backup
                         download_btn = gr.DownloadButton(
                         if annotator.hf_enabled:
                             gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
                         else:
+                            gr.Markdown("⚠️ **Auto-backup disabled**")
                         gr.Markdown(f"**Input:** `{Path(input_file).name}`")
+                gr.Markdown("---")
+                # Bottom Section: PDF Viewer (Full Width)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        if gr_pdf is None:
+                            gr.Markdown("### ⚠️ `gradio-pdf` not found\nPlease run `uv pip install gradio-pdf` and restart.")
+                            pdf_viewer = gr.HTML(visible=False)
+                        else:
+                            # Use gradio-pdf component
+                            pdf_viewer = gr_pdf(
+                                label="Source Document",
+                                height=1000,
+                                visible=True
+                            )
+                        refresh_pdf_btn = gr.Button("🔄 Reload PDF Viewer", size="sm")
+                        # Hidden PDF component to authorize file serving
+                        if annotator.pdf_dir:
+                             gr.File(value=None, visible=False, interactive=False)
         nav_state = gr.State({})
         def update_display():
+            print(f"📡 Updating display for index {annotator.current_idx}...", flush=True)
+            name, context, metadata, ai_verdicts, progress, status, nav, pdf_path, page_num = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
+            # Use gr.update for gradio_pdf component
+            pdf_update = gr.update(value=pdf_path, starting_page=page_num)
+            print(f"🖼️ PDF Update: path={pdf_path}, page={page_num}", flush=True)
+            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats, pdf_update
         def accept_and_next(notes):
             name, context, metadata, ai_verdicts, progress, status = annotator.annotate('dataset', notes)
+            _, _, _, _, _, _, nav, pdf_value, page_num = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
+            # Use gr.update for gradio_pdf component
+            pdf_update = gr.update(value=pdf_value, starting_page=page_num)
+            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats, pdf_update
         def reject_and_next(notes):
             name, context, metadata, ai_verdicts, progress, status = annotator.annotate('non-dataset', notes)
+            _, _, _, _, _, _, nav, pdf_value, page_num = annotator.get_current_display()
             chunk_info = nav.get('chunk_info', '')
             dataset_in_chunk = nav.get('dataset_in_chunk', '')
             stats = annotator.get_statistics()
+            # Use gr.update for gradio_pdf component
+            pdf_update = gr.update(value=pdf_value, starting_page=page_num)
+            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats, pdf_update
         def go_next():
             annotator.next_record()
         def toggle_ai_verdicts(show_ai):
             if show_ai:
                 # Get current AI verdicts content
+                display_data = annotator.get_current_display()
+                ai_verdicts = display_data[3]  # ai_verdicts_str is the 4th value
                 return gr.update(visible=True, value=ai_verdicts)
             return gr.update(visible=False)
             return None
         # Outputs - updated with chunk_info and dataset_in_chunk
+        # Outputs - updated with chunk_info and dataset_in_chunk
+        outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box, pdf_viewer]
+        outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state, stats_box, pdf_viewer]
         accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
             get_download_file, outputs=[download_btn]
         filter_dropdown.change(apply_filter, inputs=[filter_dropdown], outputs=outputs_list)
         show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box])
+        def initial_load_no_pdf():
+            """Initial load without PDF to avoid the blank page bug on first render.
+            The PDF will be loaded when the user first clicks the Annotate tab."""
+            print("🚀 Initial app load - PDF set to None (will load on tab select)", flush=True)
+            name, context, metadata, ai_verdicts, progress, status, nav, pdf_path, page_num = annotator.get_current_display()
+            chunk_info = nav.get('chunk_info', '')
+            dataset_in_chunk = nav.get('dataset_in_chunk', '')
+            stats = annotator.get_statistics()
+            # Return None for PDF to avoid initial render bug
+            pdf_update = gr.update(value=None)
+            return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats, pdf_update
+        # Load data when app starts - WITHOUT PDF to avoid blank page bug
+        app.load(initial_load_no_pdf, outputs=outputs_list)
+        # When Annotate tab is selected, load the PDF (this is the "second update" that triggers proper render)
         annotate_tab.select(update_display, outputs=outputs_list)
+        refresh_pdf_btn.click(update_display, outputs=outputs_list)
     return app
 # For Hugging Face Spaces deployment
 if __name__ == "__main__":
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description="Dataset Annotation Tool")
+    parser.add_argument("--input", "-i", type=str, default="validation_sample_filtering_retained.jsonl",
+                        help="Input JSONL file (default: validation_sample_filtering_retained.jsonl)")
+    parser.add_argument("--pdf-dir", "-p", type=str, default=None,
+                        help="Directory containing local PDF files (optional)")
+    parser.add_argument("--pdf-url-base", "-u", type=str, default=None,
+                        help="Base URL for remote PDFs (if not using local files)")
+    args = parser.parse_args()
     # Check if file exists
+    input_file = args.input
     if not Path(input_file).exists():
         raise FileNotFoundError(
             f"Input file '{input_file}' not found. "
     hf_dataset_repo = os.getenv("HF_DATASET_REPO")  # e.g., "username/reliefweb-annotations"
     hf_token = os.getenv("HF_TOKEN")  # HF write token
+    # Determine PDF source: command-line args take priority, then env vars
+    pdf_dir = args.pdf_dir
+    pdf_url_base = args.pdf_url_base
+    # If no explicit PDF source, check for HF PDF repo environment variable
+    if not pdf_dir and not pdf_url_base:
+        hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO")  # e.g., "ai4data/reliefweb-pdfs"
+        if hf_pdf_repo:
+            pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
+            print(f"🌐 Using HF PDF repository: {hf_pdf_repo}", flush=True)
+            print(f"   PDF URL base: {pdf_url_base}", flush=True)
+        else:
+            print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
     # Create and launch the app
+    app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
+    # Ensure allowed paths are absolute for Gradio (only needed for local files)
+    allowed = []
+    if pdf_dir:
+        pdf_dir_parent = str(Path(pdf_dir).parent.resolve())
+        allowed = [pdf_dir_parent]
+        print(f"🚀 Launching with allowed_paths: {allowed}", flush=True)
+        print(f"📂 PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
+    else:
+        print("🚀 Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
+    app.launch(allowed_paths=allowed)

upload_pdfs.py ADDED Viewed

	@@ -0,0 +1,133 @@

+#!/usr/bin/env python3
+"""
+Upload PDFs to Hugging Face Datasets.
+Usage:
+    python upload_pdfs.py --repo-id your-username/reliefweb-pdfs --pdf-dir ./unchr_reliefweb_pdfs
+Options:
+    --max-size-mb: Skip files larger than this (default: 50MB)
+    --batch-size: Upload in batches of N files (default: 100)
+Environment:
+    HF_TOKEN: Your Hugging Face write token (or use --token flag)
+"""
+import argparse
+import os
+import tempfile
+import shutil
+from pathlib import Path
+from dotenv import load_dotenv
+from huggingface_hub import HfApi, login
+# Load environment variables from .env file
+load_dotenv()
def _list_pdf_files(directory: Path) -> list:
    """Return the regular files in *directory* whose name ends in ``.pdf``.

    The match is case-insensitive (``report.PDF`` is included), sub-directories
    are ignored even if their name ends in ``.pdf``, and the result is sorted
    so that batch composition is reproducible between runs.
    """
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )


def upload_pdfs(repo_id: str, pdf_dir: str, token: str = None, private: bool = True,
                max_size_mb: float = 50, batch_size: int = 100):
    """Upload a folder of PDFs to a Hugging Face Dataset repository.

    Only the top level of ``pdf_dir`` is scanned. Files larger than
    ``max_size_mb`` are reported and skipped. The remaining files are copied
    into a temporary directory ``batch_size`` at a time and pushed with
    ``HfApi.upload_folder``, producing one commit per batch.

    Args:
        repo_id: Target dataset repository, e.g. ``"username/reliefweb-pdfs"``.
        pdf_dir: Local directory containing the PDF files.
        token: Hugging Face token; falls back to the ``HF_TOKEN`` environment
            variable when omitted.
        private: Passed to ``create_repo`` as the ``private`` flag.
        max_size_mb: Size limit in megabytes; files above it are skipped.
        batch_size: Number of files per upload commit; must be at least 1.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or if no token is
            available when there are files to upload.
        FileNotFoundError: If ``pdf_dir`` is not an existing directory.
    """
    # Reject a non-positive batch size up front: zero would divide by zero and
    # a negative value would upload nothing while still reporting success.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    pdf_path = Path(pdf_dir)
    if not pdf_path.is_dir():
        raise FileNotFoundError(f"PDF directory not found: {pdf_dir}")

    # Get all PDFs and filter by size
    all_pdfs = _list_pdf_files(pdf_path)
    max_size_bytes = max_size_mb * 1024 * 1024
    valid_pdfs = []
    skipped_pdfs = []  # (file name, size in MB) pairs, kept for the report below
    for pdf in all_pdfs:
        size = pdf.stat().st_size
        if size <= max_size_bytes:
            valid_pdfs.append(pdf)
        else:
            skipped_pdfs.append((pdf.name, size / (1024 * 1024)))

    print(f"📁 Found {len(all_pdfs)} PDF files in {pdf_dir}")
    print(f"✅ Will upload: {len(valid_pdfs)} files (under {max_size_mb}MB)")
    if skipped_pdfs:
        print(f"⚠️  Skipping {len(skipped_pdfs)} files (too large):")
        for name, size_mb in skipped_pdfs[:5]:  # Show first 5
            print(f"   - {name}: {size_mb:.1f}MB")
        if len(skipped_pdfs) > 5:
            print(f"   ... and {len(skipped_pdfs) - 5} more")

    if not valid_pdfs:
        print("❌ No valid PDF files to upload. Exiting.")
        return

    # Login to HF
    hf_token = token or os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not set. Pass --token or set HF_TOKEN environment variable.")
    login(token=hf_token, add_to_git_credential=False)
    api = HfApi()

    # Create repo if it doesn't exist. This step is best-effort: a failure is
    # reported and the upload is still attempted.
    try:
        api.create_repo(repo_id, repo_type="dataset", private=private, exist_ok=True)
        print(f"✅ Repository ready: https://huggingface.co/datasets/{repo_id}")
    except Exception as e:
        print(f"⚠️ Repo creation note: {e}")

    # Upload in batches
    batches = [
        valid_pdfs[start:start + batch_size]
        for start in range(0, len(valid_pdfs), batch_size)
    ]
    total_batches = len(batches)
    for batch_num, batch_files in enumerate(batches, start=1):
        print(f"\n🚀 Uploading batch {batch_num}/{total_batches} ({len(batch_files)} files)...")
        # Create temp directory with just this batch so upload_folder only
        # sees the files belonging to it.
        with tempfile.TemporaryDirectory() as temp_dir:
            for pdf in batch_files:
                shutil.copy2(pdf, temp_dir)
            api.upload_folder(
                folder_path=temp_dir,
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=f"Upload batch {batch_num}/{total_batches} ({len(batch_files)} PDFs)",
            )
        print(f"   ✅ Batch {batch_num} complete")

    print(f"\n🎉 Upload complete! {len(valid_pdfs)} files uploaded.")
    print(f"📎 View at: https://huggingface.co/datasets/{repo_id}")
    print("\n💡 To use in app, set:")
    print(f"   --pdf-url-base https://huggingface.co/datasets/{repo_id}/resolve/main/")
if __name__ == "__main__":
    # Script entry point: read the command-line options and hand them to
    # upload_pdfs().
    cli = argparse.ArgumentParser(description="Upload PDFs to Hugging Face Datasets")

    # Where to upload to and where to read from (both mandatory).
    cli.add_argument(
        "--repo-id", "-r",
        required=True,
        help="HF dataset repo ID (e.g., username/reliefweb-pdfs)",
    )
    cli.add_argument(
        "--pdf-dir", "-d",
        required=True,
        help="Local directory containing PDF files",
    )

    # Credentials and visibility.
    cli.add_argument(
        "--token", "-t",
        default=None,
        help="HF write token (or set HF_TOKEN env var)",
    )
    cli.add_argument(
        "--public",
        action="store_true",
        help="Make the dataset public (default: private)",
    )

    # Upload tuning knobs.
    cli.add_argument(
        "--max-size-mb",
        type=float,
        default=50,
        help="Skip files larger than this (MB, default: 50)",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=100,
        help="Upload in batches of N files (default: 100)",
    )

    opts = cli.parse_args()

    # The CLI exposes --public, while the function takes the inverse flag.
    make_private = not opts.public
    upload_pdfs(
        repo_id=opts.repo_id,
        pdf_dir=opts.pdf_dir,
        token=opts.token,
        private=make_private,
        max_size_mb=opts.max_size_mb,
        batch_size=opts.batch_size,
    )