#!/usr/bin/env python3
"""
Gradio app for validating dataset mentions from stratified validation sample.

This app allows users to:
1. Review dataset mentions with context
2. Validate as dataset or non-dataset
3. Compare extraction model vs judge (GPT-5.2)
4. Track validation progress with live statistics

Adapted from annotation_app.py for direct_judge validation workflow.
Configured for Hugging Face Spaces deployment.
"""

import gradio as gr
import json
import re
import os
import argparse
from pathlib import Path
from dotenv import load_dotenv

# Load .env for local development.
# NOTE(review): this runs before the huggingface_hub/datasets imports below;
# presumably so values from .env are already in the environment when those
# libraries are imported - keep the ordering unless confirmed unnecessary.
load_dotenv()

try:
    from gradio_pdf import PDF as gr_pdf
except ImportError:
    gr_pdf = None

from typing import Dict, List, Tuple, Optional
from datetime import datetime
from huggingface_hub import HfApi, login
from datasets import Dataset, load_dataset


# Custom CSS for the green button and dark mode toggle
_APP_CSS = """
#accept_btn {
    background-color: #22c55e !important;
    color: white !important;
}
#accept_btn:hover {
    background-color: #16a34a !important;
}
#theme_toggle {
    position: fixed;
    top: 10px;
    right: 10px;
    z-index: 1000;
    padding: 8px 16px;
    border-radius: 20px;
    cursor: pointer;
    font-size: 14px;
}
"""

# JavaScript for dark mode toggle
_APP_JS = """
function toggleDarkMode() {
    const body = document.body;
    const isDark = body.classList.contains('dark');
    if (isDark) {
        body.classList.remove('dark');
        localStorage.setItem('theme', 'light');
        document.getElementById('theme_toggle').textContent = '🌙 Dark Mode';
    } else {
        body.classList.add('dark');
        localStorage.setItem('theme', 'dark');
        document.getElementById('theme_toggle').textContent = '☀️ Light Mode';
    }
}

// Apply saved theme on load
document.addEventListener('DOMContentLoaded', function() {
    const savedTheme = localStorage.getItem('theme');
    if (savedTheme === 'dark') {
        document.body.classList.add('dark');
        const btn = document.getElementById('theme_toggle');
        if (btn) btn.textContent = '☀️ Light Mode';
    }

    // Force resize when switching to Annotate tab to help PDF viewer
    document.body.addEventListener('click', function(e) {
        if (e.target && e.target.innerText && e.target.innerText.includes('Annotate')) {
            console.log('Annotate tab clicked - forcing resize');
            setTimeout(() => {
                window.dispatchEvent(new Event('resize'));
                // Also try to find any canvases and nudge them
                document.querySelectorAll('canvas').forEach(c => {
                    c.dispatchEvent(new Event('resize'));
                });
            }, 500);
        }
    }, true);
});
"""

# Markdown shown on the "Introduction & Instructions" tab.
_INTRO_MARKDOWN = """
## Welcome to the Dataset Annotation Tool

This tool helps validate dataset mentions extracted from UNHCR and ReliefWeb documents. Your annotations will improve the accuracy of our dataset extraction model.

### What You'll Be Annotating

You'll review **candidate dataset mentions** that our AI model has identified in humanitarian documents. Your task is to determine whether each mention is:

- ✅ **A Dataset**: A collection of data that can be referenced, analyzed, or used (e.g., surveys, databases, statistical reports)
- ❌ **Not a Dataset**: A document title, framework, strategy, or general reference that doesn't represent actual data

### About the Data

- **Source**: UNHCR and ReliefWeb PDF documents
- **Sampling**: Stratified sample across different mention types (named, descriptive, vague)
- **AI Models**:
  - **Extraction Model**: Fine-tuned model that identified these mentions
  - **Judge (GPT-5.2)**: LLM-based validator that reviewed the extractions

### How to Annotate

1. **Review the Mention**: Read the **Dataset Name** and examine the **Context** (highlighted in yellow)
2. **Check Metadata**: Review document source, stratum, and geography information
3. **Compare AI Predictions** (Optional): Toggle "🤖 Show what the AI thinks" to see model predictions
4. **Make Your Decision**:
   - Click **✓ DATASET** (green) if it's a valid dataset
   - Click **✗ NOT A DATASET** (red) if it's not a dataset
5. **Add Notes** (Optional): Document your reasoning for ambiguous cases
6. **Navigate**: Use Previous/Next buttons or skip to unannotated samples
7. **Save Progress**:
   - Click **💾 Download Annotations** to backup locally
   - Auto-backup to HF Datasets (if configured)

### What Makes Something a Dataset?

✅ **IS a Dataset:**
- Survey data (e.g., "UNHCR Household Survey 2023")
- Statistical databases (e.g., "Population Statistics Database")
- Assessment results with data (e.g., "Needs Assessment 2024" when cited as data source)
- Index datasets (e.g., "Multidimensional Poverty Index")
- Monitoring data (e.g., "Protection Monitoring Data")

❌ **NOT a Dataset:**
- Report titles (e.g., "Global Trends Report 2024" as a publication)
- Frameworks/strategies (e.g., "Global Compact on Refugees")
- Assessment activities (e.g., "Rapid Assessment" as the activity itself)
- General document references

### Tips for Accuracy

- **Context is key**: The same term can be a dataset or not depending on usage
- **Look for data indicators**: Numbers, statistics, "based on", "source:", "data from"
- **When in doubt**: Add a note explaining your reasoning
- **Be consistent**: Use the same criteria throughout your annotation session

### Your Impact

Your annotations will:
- Improve model precision and recall
- Help identify patterns in false positives/negatives
- Create training data for the next model version
- Support better dataset discovery in humanitarian documents

---

**Ready to start?** Click the **"Annotate"** tab above to begin!
"""


class ValidationAnnotator:
    """
    Handle validation annotation logic and state management.

    Note: This works with stratified validation samples from direct_judge outputs.
    No 4o data available - only judge (GPT-5.2) verdicts are shown.
    """

    def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None,
                 hf_token: Optional[str] = None, pdf_dir: Optional[str] = None,
                 pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
        """Load records and prior annotations, then position on the first pending record.

        Args:
            input_file: Path to the input JSONL file (one record per line).
            hf_dataset_repo: HF dataset repo used as a cloud backup for annotations.
            hf_token: HF token; falls back to the ``HF_TOKEN`` environment variable.
            pdf_dir: Local directory containing the source PDFs.
            pdf_url_base: Base URL under which source PDFs can be fetched remotely.
            pdf_repo_id: HF dataset repo id from which PDFs are downloaded and cached.
        """
        self.input_file = Path(input_file)
        # Annotations are written next to the input file.
        self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"

        # HF Datasets integration
        self.hf_dataset_repo = hf_dataset_repo
        self.hf_token = hf_token or os.getenv("HF_TOKEN")

        # PDF configuration
        self.pdf_dir = Path(pdf_dir) if pdf_dir else None
        self.pdf_url_base = pdf_url_base
        self.pdf_repo_id = pdf_repo_id
        if self.pdf_dir and not self.pdf_dir.exists():
            print(f"⚠️ PDF directory not found: {self.pdf_dir}")

        self.hf_enabled = False
        # Try to enable HF Datasets if credentials provided
        if self.hf_dataset_repo and self.hf_token:
            try:
                login(token=self.hf_token, add_to_git_credential=False)
                self.hf_api = HfApi()
                self.hf_enabled = True
                print(f"✅ HF Datasets enabled: {self.hf_dataset_repo}")
            except Exception as e:
                # Best effort: the app stays usable with local-only storage.
                print(f"⚠️ HF Datasets disabled: {e}")
                self.hf_enabled = False

        # Load data
        self.records = self._load_records()
        self.annotations = self._load_annotations()

        # Build chunk index for navigation
        self._build_chunk_index()

        # Current position
        self.current_idx = 0

        # Filter state
        self.current_filter = "All"  # Options: "All", "named", "descriptive", "vague", "non-dataset"
        self.filtered_indices = list(range(len(self.records)))  # All records by default

        # Move to first unannotated record
        self._find_next_unannotated()

    def _load_records(self) -> List[Dict]:
        """Load records from input JSONL file, ignoring blank lines."""
        with open(self.input_file, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    def _build_chunk_index(self):
        """Build index mapping chunk_id to record indices."""
        self.chunk_ids = []  # Ordered list of unique chunk_ids
        self.chunk_to_indices = {}  # chunk_id -> list of record indices
        for idx, record in enumerate(self.records):
            # Records without a chunk_id each form their own single-record chunk.
            chunk_id = record.get('chunk_id', f'unknown_{idx}')
            if chunk_id not in self.chunk_to_indices:
                self.chunk_ids.append(chunk_id)
                self.chunk_to_indices[chunk_id] = []
            self.chunk_to_indices[chunk_id].append(idx)
        self.total_chunks = len(self.chunk_ids)
        self.total_datasets = len(self.records)

    def _get_chunk_info(self, idx: int) -> Tuple[int, int, int]:
        """Get chunk info for a given record index.

        Returns:
            (chunk_number, dataset_in_chunk, total_in_chunk), all 1-based;
            (0, 0, 0) when ``idx`` is past the end of the records.
        """
        if idx >= len(self.records):
            return (0, 0, 0)
        record = self.records[idx]
        chunk_id = record.get('chunk_id', f'unknown_{idx}')
        chunk_number = self.chunk_ids.index(chunk_id) + 1 if chunk_id in self.chunk_ids else 0
        chunk_indices = self.chunk_to_indices.get(chunk_id, [idx])
        dataset_in_chunk = chunk_indices.index(idx) + 1 if idx in chunk_indices else 1
        total_in_chunk = len(chunk_indices)
        return (chunk_number, dataset_in_chunk, total_in_chunk)

    def _load_annotations(self) -> Dict:
        """Load existing annotations from local file and/or HF Datasets.

        Returns:
            Mapping of sample_id -> annotation dict. Local entries override
            HF entries with the same sample_id, and later local lines override
            earlier ones.
        """
        annotations = {}

        # Try loading from HF Datasets first (cloud backup)
        if self.hf_enabled:
            try:
                dataset = load_dataset(self.hf_dataset_repo, split="train", token=self.hf_token)
                for item in dataset:
                    annotations[item['sample_id']] = item
                print(f"✅ Loaded {len(annotations)} annotations from HF Datasets")
            except Exception as e:
                print(f"⚠️ Could not load from HF Datasets: {e}")

        # Also load from local file (may have newer annotations)
        if self.output_file.exists():
            local_count = 0
            with open(self.output_file, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        ann = json.loads(line)
                        annotations[ann['sample_id']] = ann
                        local_count += 1
            if local_count > 0:
                print(f"✅ Loaded {local_count} annotations from local file")

        return annotations

    def _save_annotation(self, sample_id: int, verdict: str, notes: str = ""):
        """Save a single annotation to file and optionally to HF Datasets.

        The annotation is built from the record at ``self.current_idx``, so
        ``sample_id`` is expected to identify that record.

        Args:
            sample_id: Key under which the annotation is stored.
            verdict: 'dataset' or 'non-dataset'.
            notes: Optional free-text notes from the annotator.
        """
        record = self.records[self.current_idx]

        # Dataset = any tag that is NOT "non-dataset" (includes named, descriptive, vague)
        extraction_is_dataset = record['extraction_tag'] != 'non-dataset'
        judge_is_dataset = record['judge_tag'] != 'non-dataset'
        human_is_dataset = verdict == 'dataset'

        annotation = {
            'sample_id': sample_id,
            'text': record['text'],
            'document': record['document'],
            'stratum': record['stratum'],
            # Human annotation
            'human_verdict': verdict,  # 'dataset' or 'non-dataset'
            'human_notes': notes,
            'annotated_at': datetime.now().isoformat(),
            # Original extraction
            'extraction_tag': record['extraction_tag'],
            'extraction_confidence': record['extraction_confidence'],
            # Judge (GPT-5.2)
            'judge_tag': record['judge_tag'],
            'judge_confidence': record['judge_confidence'],
            'judge_reasoning': record.get('judge_reasoning', ''),
            'judge_data_type': record.get('judge_data_type', ''),
            # Computed agreements
            'human_agrees_extraction': human_is_dataset == extraction_is_dataset,
            'human_agrees_judge': human_is_dataset == judge_is_dataset,
            'extraction_agrees_judge': extraction_is_dataset == judge_is_dataset,
        }

        # Update in-memory annotations
        self.annotations[sample_id] = annotation

        # Append to local file (re-annotations append a new line; the loader
        # keeps the last line per sample_id)
        with open(self.output_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(annotation, ensure_ascii=False) + '\n')

        # Push to HF Datasets (backup); a failure must not lose the local save
        if self.hf_enabled:
            try:
                self._push_to_hf_datasets()
            except Exception as e:
                print(f"⚠️ Failed to push to HF Datasets: {e}")

    def _push_to_hf_datasets(self):
        """Push all annotations to HF Datasets.

        Raises:
            Exception: Re-raises whatever the dataset creation or push raised,
                after logging it.
        """
        if not self.hf_enabled or not self.annotations:
            return
        try:
            # Convert annotations dict to list
            annotations_list = list(self.annotations.values())
            # Create dataset
            dataset = Dataset.from_list(annotations_list)
            # Push to hub
            dataset.push_to_hub(
                self.hf_dataset_repo,
                token=self.hf_token,
                private=True  # Keep annotations private by default
            )
            print(f"✅ Pushed {len(annotations_list)} annotations to HF Datasets")
        except Exception as e:
            print(f"⚠️ Error pushing to HF Datasets: {e}")
            raise

    def _split_sentences(self, text: str) -> list:
        """Split text into sentences using a simple rule-based approach."""
        # Split on period/question/exclamation followed by whitespace, or paragraph breaks
        chunks = re.split(r'(?<=[.!?])\s+|\n\s*\n', text)
        return [c.strip() for c in chunks if c.strip()]

    def _extract_context(self, text: str, dataset_name: str, context_sentences: int = 2) -> list:
        """
        Extract context around dataset mention and format for highlighting.

        Uses sentence-based windowing: returns the sentence containing the dataset
        plus context_sentences before and after (default: ±2 sentences).

        Args:
            text: Full context text to search in.
            dataset_name: Mention to locate (case-insensitive, whitespace-flexible).
            context_sentences: Number of sentences kept on each side of the match.

        Returns:
            List of tuples: [(text, label), ...] where label is "DATASET" for the
            dataset name and None otherwise. When the mention cannot be located,
            a single unlabelled tuple with the text truncated to 500 characters.
        """
        if not text:
            return [(f"[No context available for '{dataset_name}']", None)]

        # Normalize text: remove excessive whitespace but preserve sentence structure
        text = re.sub(r'\s+', ' ', text).strip()
        dataset_name_clean = re.sub(r'\s+', ' ', dataset_name).strip()

        # Fallback used whenever the mention cannot be highlighted
        truncated = [(text[:500] + "..." if len(text) > 500 else text, None)]

        # Split into sentences
        sentences = self._split_sentences(text)
        if not sentences:
            return truncated

        # Create regex to match name with flexible whitespace
        name_parts = dataset_name_clean.split()
        if not name_parts:
            return truncated
        pattern_str = r'\s+'.join([re.escape(part) for part in name_parts])
        pattern = re.compile(pattern_str, re.IGNORECASE)

        # Find first sentence containing the dataset name
        target_idx = next(
            (i for i, sent in enumerate(sentences) if pattern.search(sent)),
            None,
        )
        if target_idx is None:
            # Fallback: return truncated text without highlighting
            return truncated

        # Get ±context_sentences around the match
        start_idx = max(0, target_idx - context_sentences)
        end_idx = min(len(sentences), target_idx + context_sentences + 1)

        # Join the context sentences
        context_text = " ".join(sentences[start_idx:end_idx])

        # Add ellipsis indicators
        prefix = "..." if start_idx > 0 else ""
        suffix = "..." if end_idx < len(sentences) else ""

        # Find the dataset name in the context for highlighting
        match = pattern.search(context_text)
        if not match:
            # Return without highlighting if somehow not found
            return [(prefix + context_text + suffix, None)]

        # Build highlighted output
        before = prefix + context_text[:match.start()]
        dataset = context_text[match.start():match.end()]
        after = context_text[match.end():] + suffix
        return [
            (before, None),
            (dataset, "DATASET"),
            (after, None)
        ]

    def set_filter(self, filter_value: str):
        """Set the current filter and update filtered indices.

        When 'All' is selected: Show all records including siblings
        When a specific tag is selected: Show only primary samples with that tag (no siblings)
        """
        self.current_filter = filter_value
        if filter_value == "All":
            # Show all records including siblings
            self.filtered_indices = list(range(len(self.records)))
        else:
            # Filter by extraction_tag only (not judge_tag)
            # AND exclude siblings (only show primary samples)
            self.filtered_indices = [
                i for i, record in enumerate(self.records)
                if record.get('extraction_tag') == filter_value
                and record.get('is_primary', True)  # Only primary samples, not siblings
            ]
        # Always jump to first unannotated record in the new filtered set for determinism
        self._find_next_unannotated()

    def _is_annotated(self, idx: int) -> bool:
        """Check if a record has been annotated."""
        sample_id = self.records[idx].get('sample_id', idx)
        return sample_id in self.annotations

    def _should_skip(self, idx: int) -> bool:
        """Check if record is a one-word vague/descriptive that should be skipped."""
        if idx >= len(self.records):
            return False
        record = self.records[idx]
        if len(record.get('text', '').split()) != 1:
            return False
        # Skip one-word mentions tagged vague/descriptive by either model
        skip_tags = {'vague', 'descriptive'}
        return (record.get('extraction_tag', '') in skip_tags
                or record.get('judge_tag', '') in skip_tags)

    def _find_next_unannotated(self):
        """Find the first unannotated, non-skippable record within the current filtered set.

        Falls back to the first filtered record when everything is annotated
        or skippable, and to ``len(self.records)`` (the completion state) when
        the filtered set is empty.
        """
        for idx in self.filtered_indices:
            if not self._is_annotated(idx) and not self._should_skip(idx):
                self.current_idx = idx
                return
        if self.filtered_indices:
            self.current_idx = self.filtered_indices[0]
        else:
            self.current_idx = len(self.records)

    def _format_ai_verdicts(self, record: Dict) -> str:
        """Build the markdown describing the extraction model and judge verdicts."""
        # Dataset = any tag that is NOT "non-dataset" (includes named, descriptive, vague)
        ext_tag = record['extraction_tag']
        ext_is_dataset = ext_tag != 'non-dataset'
        ext_emoji = "✓" if ext_is_dataset else "✗"

        judge_tag = record['judge_tag']
        judge_is_dataset = judge_tag != 'non-dataset'
        judge_emoji = "✓" if judge_is_dataset else "✗"

        parts = [
            # Extraction model verdict
            f"### Extraction Model:\n",
            f"**Verdict:** {ext_emoji} {'Dataset' if ext_is_dataset else 'Non-Dataset'}\n",
            f"**Tag:** `{ext_tag}`\n",
            f"**Confidence:** {record['extraction_confidence']:.1%}\n",
            # Judge (GPT-5.2) verdict
            f"\n### Judge (GPT-5.2):\n",
            f"**Verdict:** {judge_emoji} {'Dataset' if judge_is_dataset else 'Non-Dataset'}\n",
            f"**Tag:** `{judge_tag}`\n",
            f"**Confidence:** {record['judge_confidence']:.1%}\n",
        ]
        if record.get('judge_data_type'):
            parts.append(f"**Data Type:** {record['judge_data_type']}\n")
        if record.get('judge_reasoning'):
            reasoning = record['judge_reasoning']
            parts.append(f"\n*Reasoning:* {reasoning}...")
        return "".join(parts)

    def _format_metadata(self, record: Dict) -> str:
        """Build the markdown bullet list with the record's metadata."""
        metadata_parts = [f"- **Stratum:** `{record['stratum']}`"]
        if record.get("source_document"):
            metadata_parts.append(f"- **Source File:** `{record.get('source_document')}`")
        if record.get("page_number"):
            metadata_parts.append(f"- **Page(s):** {record.get('page_number')}")
        is_primary = record.get('is_primary', True)
        metadata_parts.append(f"- **Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}")
        if record.get('geography'):
            geo = record['geography']
            if isinstance(geo, dict):
                geo = geo.get('text', str(geo))
            metadata_parts.append(f"- **Geography:** {geo}")
        return "\n".join(metadata_parts)

    def _resolve_pdf(self, record: Dict) -> Tuple[Optional[str], int]:
        """Resolve the PDF location and starting page for a record.

        Sources are tried in priority order: local ``pdf_dir``, then the HF
        repo ``pdf_repo_id`` (downloaded/cached, falling back to
        ``pdf_url_base`` on failure), then ``pdf_url_base`` alone.

        Returns:
            (pdf_value, page_num) where pdf_value is a local path, a URL, or
            None when no PDF could be resolved, and page_num is the stored
            page number plus one (1 when missing or not convertible to int).
        """
        source_doc = record.get("source_document")
        page_num = record.get("page_number")

        # Convert page_num to int and add 1 (offset from 0-indexed data)
        try:
            page_num = int(page_num) + 1 if page_num else 1
        except (ValueError, TypeError):
            page_num = 1

        if not source_doc:
            return None, page_num

        pdf_value = None
        if self.pdf_dir:
            # Local PDF directory
            pdf_path = self.pdf_dir / source_doc
            if pdf_path.exists():
                pdf_value = str(pdf_path.absolute())
                print(f"📄 Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
            else:
                print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
        elif self.pdf_repo_id:
            # Server-side caching via HF Hub (avoids CORS/frontend download issues)
            # Remove leading slash if present
            source_doc_clean = source_doc.lstrip('/')
            try:
                from huggingface_hub import hf_hub_download
                print(f"📥 Downloading/Caching PDF from {self.pdf_repo_id}: {source_doc_clean}", flush=True)
                pdf_path_cached = hf_hub_download(
                    repo_id=self.pdf_repo_id,
                    filename=source_doc_clean,
                    repo_type="dataset",
                    token=self.hf_token
                )
                pdf_value = str(pdf_path_cached)
                print(f"📦 Cached local path: {pdf_value}", flush=True)
            except Exception as e:
                print(f"❌ Failed to download PDF: {e}", flush=True)
                # Fallback to URL if download fails and url base is available
                if self.pdf_url_base:
                    pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
                    print(f"⚠️ Falling back to remote URL: {pdf_value}", flush=True)
        elif self.pdf_url_base:
            # Remote PDF via URL (e.g., HF Datasets)
            # Remove any leading slashes from source_doc
            source_doc_clean = source_doc.lstrip('/')
            pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
            print(f"🌐 Using remote PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
        else:
            print(f"ℹ️ PDF source specified ({source_doc}) but no pdf_dir or pdf_url_base provided.", flush=True)

        return pdf_value, page_num

    def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict, Optional[str], int]:
        """Get current record for display.

        Returns:
            A 9-tuple ``(text, context, metadata_md, ai_verdicts_md, progress,
            status, nav, pdf_value, page_num)``. The same shape is returned in
            the completion state (``current_idx`` past the last record), with
            an empty ``nav`` dict, ``pdf_value=None`` and ``page_num=1``, so
            callers can always unpack nine values.
        """
        if self.current_idx >= len(self.records):
            return (
                "🎉 All samples validated!",
                [],
                "",
                "",
                f"Progress: {len(self.annotations)}/{len(self.records)} (100%)",
                "✅ Complete",
                {},
                None,
                1,
            )

        record = self.records[self.current_idx]

        # Get context with highlighting
        context = self._extract_context(
            record.get('full_context', '') or record.get('usage_context', ''),
            record['text']
        )

        # Build AI verdicts (Judge only - no 4o in direct_judge)
        ai_verdicts_str = self._format_ai_verdicts(record)

        # Metadata
        metadata_str = self._format_metadata(record)

        # Get chunk info
        chunk_num, ds_in_chunk, total_in_chunk = self._get_chunk_info(self.current_idx)

        # Progress: N/N-max datasets
        annotated = len(self.annotations)
        progress = f"Datasets: {annotated}/{self.total_datasets} ({annotated/self.total_datasets*100:.1f}%)"

        # Status
        if self._is_annotated(self.current_idx):
            ann = self.annotations.get(record.get('sample_id', self.current_idx), {})
            status = f"✅ Validated as: {ann.get('human_verdict', 'unknown')}"
        else:
            status = "❓ Pending Validation"

        # Navigation info with chunk details
        nav = {
            'chunk_info': f"Input Text: {chunk_num}/{self.total_chunks}",
            'dataset_in_chunk': f"Dataset: {ds_in_chunk}/{total_in_chunk} in this chunk",
            'record_info': f"Overall: {self.current_idx + 1}/{self.total_datasets}",
            'can_prev': self.current_idx > 0,
            'can_next': self.current_idx < self.total_datasets - 1
        }

        # PDF Source path and page
        pdf_value, page_num = self._resolve_pdf(record)

        return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav, pdf_value, page_num

    def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]:
        """Annotate current record and move to next.

        Returns:
            The first six display values of the record shown afterwards:
            (text, context, metadata, ai_verdicts, progress, status).
        """
        if self.current_idx < len(self.records):
            record = self.records[self.current_idx]
            sample_id = record.get('sample_id', self.current_idx)
            self._save_annotation(sample_id, verdict, notes)
            self.next_record()
        return self.get_current_display()[:6]

    def next_record(self):
        """Move to next record in the filtered set (no-op at the last one)."""
        if not self.filtered_indices:
            return
        try:
            current_pos = self.filtered_indices.index(self.current_idx)
        except ValueError:
            # Current idx not in filtered set (maybe filter changed), jump to first
            self.current_idx = self.filtered_indices[0]
            return
        if current_pos < len(self.filtered_indices) - 1:
            self.current_idx = self.filtered_indices[current_pos + 1]

    def prev_record(self):
        """Move to previous record in the filtered set (no-op at the first one)."""
        if not self.filtered_indices:
            return
        try:
            current_pos = self.filtered_indices.index(self.current_idx)
        except ValueError:
            # Current idx not in filtered set, jump to first
            self.current_idx = self.filtered_indices[0]
            return
        if current_pos > 0:
            self.current_idx = self.filtered_indices[current_pos - 1]

    def skip_to_next_unannotated(self):
        """Skip to next unannotated record (also skipping one-word vague/descriptive).

        Only records in the current filtered set that come after the current
        position are considered, matching next_record/prev_record. The
        position is left unchanged when no such record exists.
        """
        for i in self.filtered_indices:
            if i > self.current_idx and not self._is_annotated(i) and not self._should_skip(i):
                self.current_idx = i
                return

    def get_statistics(self) -> str:
        """Get current annotation statistics as markdown."""
        if not self.annotations:
            return "_No annotations yet_"

        total = len(self.annotations)
        human_dataset = sum(1 for a in self.annotations.values() if a['human_verdict'] == 'dataset')
        human_non = total - human_dataset
        agrees_ext = sum(1 for a in self.annotations.values() if a['human_agrees_extraction'])
        agrees_judge = sum(1 for a in self.annotations.values() if a['human_agrees_judge'])

        lines = [
            f"**Annotated:** {total}/{len(self.records)}",
            "",
            "**Human Verdicts:**",
            f"- Dataset: {human_dataset}",
            f"- Non-Dataset: {human_non}",
            "",
            "**Agreement Rates:**",
            f"- Extraction Model: {agrees_ext/total*100:.1f}%",
            f"- Judge (GPT-5.2): {agrees_judge/total*100:.1f}%",
            "",
        ]
        return "\n".join(lines)


def create_app(input_file: str, hf_dataset_repo: Optional[str] = None,
               hf_token: Optional[str] = None, pdf_dir: Optional[str] = None,
               pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
    """Create and configure Gradio app.

    Args:
        input_file: Path to the input JSONL file.
        hf_dataset_repo: HF dataset repo for annotation backups (optional).
        hf_token: HF token (optional).
        pdf_dir: Local directory with source PDFs (optional).
        pdf_url_base: Base URL for remote PDFs (optional).
        pdf_repo_id: HF dataset repo id holding the PDFs (optional).

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """
    annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)

    with gr.Blocks(title="Dataset Annotation Tool", css=_APP_CSS, js=_APP_JS) as app:
        # Theme toggle button
        # NOTE(review): the HTML payload is empty, so no #theme_toggle element is
        # rendered and the dark-mode JS has nothing to attach to - confirm whether
        # the button markup was lost.
        gr.HTML('')

        gr.Markdown("# Dataset Annotation Tool")

        with gr.Tabs():
            # Tab 1: Introduction and Instructions
            with gr.Tab("📖 Introduction & Instructions"):
                gr.Markdown(_INTRO_MARKDOWN)

            # Get initial values for robust first render
            (init_name, init_context, init_metadata, init_ai, init_progress,
             init_status, init_nav, init_pdf, init_page) = annotator.get_current_display()
            init_chunk_info = init_nav.get('chunk_info', '')
            init_dataset_in_chunk = init_nav.get('dataset_in_chunk', '')
            init_stats = annotator.get_statistics()

            # Tab 2: Annotation Interface
            with gr.Tab("✏️ Annotate") as annotate_tab:
                gr.Markdown("Review and annotate dataset mentions. PDF viewer is below for reference.")

                # Top Section: Annotation Controls
                with gr.Row():
                    # Dataset Info & Context
                    with gr.Column(scale=3):
                        dataset_name = gr.Textbox(label="Dataset Name", value=init_name, interactive=False, max_lines=2)
                        context_box = gr.HighlightedText(
                            label="Context (±2 sentences, dataset highlighted)",
                            value=init_context,
                            color_map={"DATASET": "yellow"},
                            show_legend=False,
                            combine_adjacent=True
                        )
                        metadata_box = gr.Markdown(init_metadata, label="Metadata")
                        show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False)
                        ai_verdicts_box = gr.Markdown(init_ai, label="AI Analysis", visible=False)

                    # Controls & Progress
                    with gr.Column(scale=2):
                        # Filter dropdown
                        filter_dropdown = gr.Dropdown(
                            choices=["All", "named", "descriptive", "vague", "non-dataset"],
                            value="All",
                            label="🔍 Filter by Tag Type",
                            interactive=True
                        )
                        progress_box = gr.Textbox(label="Progress", value=init_progress, interactive=False, lines=1)
                        chunk_info_box = gr.Textbox(label="Input Text Position", value=init_chunk_info, interactive=False, lines=1)
                        dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", value=init_dataset_in_chunk, interactive=False, lines=1)
                        status_box = gr.Textbox(label="Status", value=init_status, interactive=False, lines=1)
                        notes_box = gr.Textbox(
                            label="Notes (optional)",
                            placeholder="Add any comments about this dataset...",
                            lines=3
                        )

                        with gr.Row():
                            accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn")
                            reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg")

                        gr.Markdown("---")

                        with gr.Row():
                            prev_btn = gr.Button("← Previous", size="sm")
                            next_btn = gr.Button("Next →", size="sm")
                            skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")

                        with gr.Accordion("📊 Live Statistics", open=False):
                            stats_box = gr.Markdown(init_stats)

                        # Download button for manual backup
                        download_btn = gr.DownloadButton(
                            "💾 Download Annotations",
                            value=str(annotator.output_file) if annotator.output_file.exists() else None,
                            size="sm",
                            variant="secondary"
                        )

                        # HF Datasets status
                        if annotator.hf_enabled:
                            gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
                        else:
                            gr.Markdown("⚠️ **Auto-backup disabled**")

                        gr.Markdown(f"**Input:** `{Path(input_file).name}`")

                gr.Markdown("---")

                # Bottom Section: PDF Viewer (Full Width)
                with gr.Row():
                    with gr.Column(scale=1):
                        if gr_pdf is None:
                            gr.Markdown("### ⚠️ `gradio-pdf` not found\nPlease run `uv pip install gradio-pdf` and restart.")
                            pdf_viewer = gr.HTML(visible=False)
                        else:
                            # Use gradio-pdf component
                            pdf_viewer = gr_pdf(
                                label="Source Document",
                                height=1000,
                                visible=True
                            )
                        refresh_pdf_btn = gr.Button("🔄 Reload PDF Viewer", size="sm")

                # Hidden PDF component to authorize file serving
                if annotator.pdf_dir:
                    gr.File(value=None, visible=False, interactive=False)

        nav_state = gr.State({})

        def update_display():
            """Return the 11 output values for the current record, PDF included."""
            print(f"📡 Updating display for index {annotator.current_idx}...", flush=True)
            (name, context, metadata, ai_verdicts, progress, status, nav,
             pdf_path, page_num) = annotator.get_current_display()
            chunk_info = nav.get('chunk_info', '')
            dataset_in_chunk = nav.get('dataset_in_chunk', '')
            stats = annotator.get_statistics()
            # Use gr.update for gradio_pdf component
            pdf_update = gr.update(value=pdf_path, starting_page=page_num)
            print(f"🖼️ PDF Update: path={pdf_path}, page={page_num}", flush=True)
            return (name, context, metadata, ai_verdicts, progress, chunk_info,
                    dataset_in_chunk, status, nav, stats, pdf_update)

        def annotate_and_next(verdict, notes):
            """Save ``verdict`` for the current record, advance, and refresh all outputs.

            Returns the values for ``outputs_annotate``: the regular display
            tuple with an empty string inserted after ``status`` to clear the
            notes box.
            """
            annotator.annotate(verdict, notes)
            display = update_display()
            return display[:8] + ("",) + display[8:]

        def accept_and_next(notes):
            return annotate_and_next('dataset', notes)

        def reject_and_next(notes):
            return annotate_and_next('non-dataset', notes)

        def go_next():
            annotator.next_record()
            return update_display()

        def go_prev():
            annotator.prev_record()
            return update_display()

        def skip_unannotated():
            annotator.skip_to_next_unannotated()
            return update_display()

        def toggle_ai_verdicts(show_ai):
            if show_ai:
                # Get current AI verdicts content
                display_data = annotator.get_current_display()
                ai_verdicts = display_data[3]  # ai_verdicts_str is the 4th value
                return gr.update(visible=True, value=ai_verdicts)
            return gr.update(visible=False)

        def get_download_file():
            """Return the path to the annotations file for download."""
            if annotator.output_file.exists():
                return str(annotator.output_file)
            return None

        def apply_filter(filter_value):
            annotator.set_filter(filter_value)
            return update_display()

        def initial_load_no_pdf():
            """Initial load without PDF to avoid the blank page bug on first render.

            The PDF will be loaded when the user first clicks the Annotate tab.
            """
            print("🚀 Initial app load - PDF set to None (will load on tab select)", flush=True)
            (name, context, metadata, ai_verdicts, progress, status, nav,
             pdf_path, page_num) = annotator.get_current_display()
            chunk_info = nav.get('chunk_info', '')
            dataset_in_chunk = nav.get('dataset_in_chunk', '')
            stats = annotator.get_statistics()
            # Return None for PDF to avoid initial render bug
            pdf_update = gr.update(value=None)
            return (name, context, metadata, ai_verdicts, progress, chunk_info,
                    dataset_in_chunk, status, nav, stats, pdf_update)

        # Outputs - updated with chunk_info and dataset_in_chunk
        outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box,
                        chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box, pdf_viewer]
        # Same as outputs_list plus notes_box (cleared after each annotation)
        outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box,
                            chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state,
                            stats_box, pdf_viewer]

        accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
            get_download_file, outputs=[download_btn]
        )
        reject_btn.click(reject_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
            get_download_file, outputs=[download_btn]
        )
        next_btn.click(go_next, outputs=outputs_list)
        prev_btn.click(go_prev, outputs=outputs_list)
        skip_btn.click(skip_unannotated, outputs=outputs_list)

        filter_dropdown.change(apply_filter, inputs=[filter_dropdown], outputs=outputs_list)
        show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box])

        # Load data when app starts - WITHOUT PDF to avoid blank page bug
        app.load(initial_load_no_pdf, outputs=outputs_list)

        # When Annotate tab is selected, load the PDF (this is the "second update" that triggers proper render)
        annotate_tab.select(update_display, outputs=outputs_list)
        refresh_pdf_btn.click(update_display, outputs=outputs_list)

    return app


# For Hugging Face Spaces deployment
if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Dataset Annotation Tool")
    parser.add_argument("--input", "-i", type=str,
                        default="validation_sample_filtering_retained.jsonl",
                        help="Input JSONL file (default: validation_sample_filtering_retained.jsonl)")
    parser.add_argument("--pdf-dir", "-p", type=str, default=None,
                        help="Directory containing local PDF files (optional)")
    parser.add_argument("--pdf-url-base", "-u", type=str, default=None,
                        help="Base URL for remote PDFs (if not using local files)")
    args = parser.parse_args()

    # Check if file exists
    input_file = args.input
    if not Path(input_file).exists():
        raise FileNotFoundError(
            f"Input file '{input_file}' not found. "
            "Please ensure the data file is in the repository."
        )

    # Get HF credentials from environment (set in Space secrets)
    hf_dataset_repo = os.getenv("HF_DATASET_REPO")  # e.g., "username/reliefweb-annotations"
    hf_token = os.getenv("HF_TOKEN")  # HF write token

    # Determine PDF source: command-line args take priority, then env vars
    pdf_dir = args.pdf_dir
    pdf_url_base = args.pdf_url_base

    # If no explicit PDF source, check for HF PDF repo environment variable
    pdf_repo_id = None
    if not pdf_dir and not pdf_url_base:
        hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO")  # e.g., "ai4data/reliefweb-pdfs"
        if hf_pdf_repo:
            # Handle both formats: repo ID or full URL
            if hf_pdf_repo.startswith("https://"):
                # Already a full URL, use it directly (ensure it ends with /)
                pdf_url_base = hf_pdf_repo.rstrip('/') + '/'
            else:
                # Repo ID format - enabling server-side caching!
                pdf_repo_id = hf_pdf_repo
                # Also set url base as fallback
                pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
            print(f"🌐 Using HF PDF repository: {hf_pdf_repo}", flush=True)
            if pdf_repo_id:
                print(f" 🚀 Server-side caching ENABLED for repo: {pdf_repo_id}", flush=True)
            print(f" PDF URL base (fallback): {pdf_url_base}", flush=True)
        else:
            print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)

    # Create and launch the app
    app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)

    # Ensure allowed paths are absolute for Gradio (only needed for local files)
    allowed = []
    if pdf_dir:
        pdf_dir_parent = str(Path(pdf_dir).parent.resolve())
        allowed = [pdf_dir_parent]
        print(f"🚀 Launching with allowed_paths: {allowed}", flush=True)
        print(f"📂 PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
    elif pdf_repo_id:
        # If caching from HF, we need to allow access to the HF cache directory
        # Typical path: ~/.cache/huggingface/hub
        # NOTE(review): allowing the whole home directory is much broader than
        # the cache itself and makes every file under it servable by the app;
        # consider narrowing this to the resolved HF cache path.
        home_dir = str(Path.home().resolve())
        allowed = [home_dir]
        print(f"🚀 Launching with cached HF PDFs - Allowing access to: {allowed}", flush=True)
    else:
        print("🚀 Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)

    app.launch(allowed_paths=allowed, ssr_mode=False)