#!/usr/bin/env python3 """ Gradio app for validating dataset mentions from stratified validation sample. This app allows users to: 1. Review dataset mentions with context 2. Validate as dataset or non-dataset 3. Compare extraction model vs judge (GPT-5.2) 4. Track validation progress with live statistics Adapted from annotation_app.py for direct_judge validation workflow. Usage: python validation_annotation_app.py --input validation_sample.jsonl """ import gradio as gr import json import re from pathlib import Path from typing import Dict, List, Tuple, Optional import argparse from datetime import datetime class ValidationAnnotator: """ Handle validation annotation logic and state management. Note: This works with stratified validation samples from direct_judge outputs. No 4o data available - only judge (GPT-5.2) verdicts are shown. """ def __init__(self, input_file: str): self.input_file = Path(input_file) self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl" # Load data self.records = self._load_records() self.annotations = self._load_annotations() # Build chunk index for navigation self._build_chunk_index() # Current position self.current_idx = 0 # Move to first unannotated record self._find_next_unannotated() def _load_records(self) -> List[Dict]: """Load records from input JSONL file.""" records = [] with open(self.input_file, 'r', encoding='utf-8') as f: for line in f: if line.strip(): records.append(json.loads(line)) return records def _build_chunk_index(self): """Build index mapping chunk_id to record indices.""" self.chunk_ids = [] # Ordered list of unique chunk_ids self.chunk_to_indices = {} # chunk_id -> list of record indices for idx, record in enumerate(self.records): chunk_id = record.get('chunk_id', f'unknown_{idx}') if chunk_id not in self.chunk_to_indices: self.chunk_ids.append(chunk_id) self.chunk_to_indices[chunk_id] = [] self.chunk_to_indices[chunk_id].append(idx) self.total_chunks = len(self.chunk_ids) self.total_datasets = len(self.records) def _get_chunk_info(self, idx: int) -> Tuple[int, int, int]: """Get chunk info for a given record index. Returns: (chunk_number, dataset_in_chunk, total_in_chunk) """ if idx >= len(self.records): return (0, 0, 0) record = self.records[idx] chunk_id = record.get('chunk_id', f'unknown_{idx}') chunk_number = self.chunk_ids.index(chunk_id) + 1 if chunk_id in self.chunk_ids else 0 chunk_indices = self.chunk_to_indices.get(chunk_id, [idx]) dataset_in_chunk = chunk_indices.index(idx) + 1 if idx in chunk_indices else 1 total_in_chunk = len(chunk_indices) return (chunk_number, dataset_in_chunk, total_in_chunk) def _load_annotations(self) -> Dict: """Load existing annotations if available.""" annotations = {} if self.output_file.exists(): with open(self.output_file, 'r', encoding='utf-8') as f: for line in f: if line.strip(): ann = json.loads(line) annotations[ann['sample_id']] = ann return annotations def _save_annotation(self, sample_id: int, verdict: str, notes: str = ""): """Save a single annotation to file.""" record = self.records[self.current_idx] # Determine if extraction/judge said dataset # Dataset = any tag that is NOT "non-dataset" (includes named, descriptive, vague) extraction_is_dataset = record['extraction_tag'] != 'non-dataset' judge_is_dataset = record['judge_tag'] != 'non-dataset' human_is_dataset = verdict == 'dataset' annotation = { 'sample_id': sample_id, 'text': record['text'], 'document': record['document'], 'stratum': record['stratum'], # Human annotation 'human_verdict': verdict, # 'dataset' or 'non-dataset' 'human_notes': notes, 'annotated_at': datetime.now().isoformat(), # Original extraction 'extraction_tag': record['extraction_tag'], 'extraction_confidence': record['extraction_confidence'], # Judge (GPT-5.2) 'judge_tag': record['judge_tag'], 'judge_confidence': record['judge_confidence'], 'judge_reasoning': record.get('judge_reasoning', ''), 'judge_data_type': record.get('judge_data_type', ''), # Computed agreements 'human_agrees_extraction': human_is_dataset == extraction_is_dataset, 'human_agrees_judge': human_is_dataset == judge_is_dataset, 'extraction_agrees_judge': extraction_is_dataset == judge_is_dataset, } # Update in-memory annotations self.annotations[sample_id] = annotation # Append to file with open(self.output_file, 'a', encoding='utf-8') as f: f.write(json.dumps(annotation, ensure_ascii=False) + '\n') def _extract_context(self, text: str, dataset_name: str, context_sentences: int = 1) -> list: """ Extract context around dataset mention and format for highlighting. Returns: List of tuples: [(text, label), ...] where label is "DATASET" for the dataset name """ if not text: return [(f"[No context available for '{dataset_name}']", None)] # Normalize text: remove line breaks and extra whitespace text = re.sub(r'\s+', ' ', text).strip() dataset_name_clean = re.sub(r'\s+', ' ', dataset_name).strip() # Find the dataset name in text (case-insensitive) pattern = re.escape(dataset_name_clean) match = re.search(pattern, text, re.IGNORECASE) if not match: # Return full context without highlighting return [(text[:500] + "..." if len(text) > 500 else text, None)] # Get position of match start_pos = match.start() end_pos = match.end() # Get context around match context_start = max(0, start_pos - 200) context_end = min(len(text), end_pos + 200) before = ("..." if context_start > 0 else "") + text[context_start:start_pos] dataset = text[start_pos:end_pos] after = text[end_pos:context_end] + ("..." if context_end < len(text) else "") return [ (before, None), (dataset, "DATASET"), (after, None) ] def _is_annotated(self, idx: int) -> bool: """Check if a record has been annotated.""" sample_id = self.records[idx].get('sample_id', idx) return sample_id in self.annotations def _should_skip(self, idx: int) -> bool: """Check if record is a one-word vague/descriptive that should be skipped.""" if idx >= len(self.records): return False record = self.records[idx] text = record.get('text', '') word_count = len(text.split()) ext_tag = record.get('extraction_tag', '') judge_tag = record.get('judge_tag', '') # Skip one-word vague/descriptive mentions skip_tags = {'vague', 'descriptive'} if word_count == 1 and (ext_tag in skip_tags or judge_tag in skip_tags): return True return False def _find_next_unannotated(self): """Find the next unannotated record (skipping one-word vague/descriptive).""" for i in range(len(self.records)): if not self._is_annotated(i) and not self._should_skip(i): self.current_idx = i return # All annotated or skippable self.current_idx = len(self.records) def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict]: """Get current record for display.""" if self.current_idx >= len(self.records): return "🎉 All samples validated!", [], "", "", f"Progress: {len(self.annotations)}/{len(self.records)} (100%)", "✅ Complete", {} record = self.records[self.current_idx] # Get context with highlighting context = self._extract_context( record.get('full_context', '') or record.get('usage_context', ''), record['text'] ) # Build AI verdicts (Judge only - no 4o in direct_judge) # Dataset = any tag that is NOT "non-dataset" (includes named, descriptive, vague) ai_verdicts_str = "" # Extraction model verdict # Dataset if tag is NOT "non-dataset" ext_tag = record['extraction_tag'] ext_is_dataset = ext_tag != 'non-dataset' ext_emoji = "✓" if ext_is_dataset else "✗" ai_verdicts_str = f"### 🤖 Extraction Model:\n" ai_verdicts_str += f"**Verdict:** {ext_emoji} {'Dataset' if ext_is_dataset else 'Non-Dataset'}\n" ai_verdicts_str += f"**Tag:** `{ext_tag}`\n" ai_verdicts_str += f"**Confidence:** {record['extraction_confidence']:.1%}\n" # Judge (GPT-5.2) verdict # Dataset if tag is NOT "non-dataset" judge_tag = record['judge_tag'] judge_is_dataset = judge_tag != 'non-dataset' judge_emoji = "✓" if judge_is_dataset else "✗" ai_verdicts_str += f"\n### 🧑‍⚖️ Judge (GPT-5.2):\n" ai_verdicts_str += f"**Verdict:** {judge_emoji} {'Dataset' if judge_is_dataset else 'Non-Dataset'}\n" ai_verdicts_str += f"**Tag:** `{judge_tag}`\n" ai_verdicts_str += f"**Confidence:** {record['judge_confidence']:.1%}\n" if record.get('judge_data_type'): ai_verdicts_str += f"**Data Type:** {record['judge_data_type']}\n" if record.get('judge_reasoning'): reasoning = record['judge_reasoning'][:300] ai_verdicts_str += f"\n*Reasoning:* {reasoning}..." # Metadata metadata_parts = [] metadata_parts.append(f"**Stratum:** `{record['stratum']}`") metadata_parts.append(f"**Document:** `{record['document'][:50]}...`") is_primary = record.get('is_primary', True) metadata_parts.append(f"**Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}") if record.get('geography'): geo = record['geography'] if isinstance(geo, dict): geo = geo.get('text', str(geo)) metadata_parts.append(f"**Geography:** {geo}") metadata_str = "\n".join(metadata_parts) # Get chunk info chunk_num, ds_in_chunk, total_in_chunk = self._get_chunk_info(self.current_idx) # Progress: N/N-max datasets annotated = len(self.annotations) progress = f"Datasets: {annotated}/{self.total_datasets} ({annotated/self.total_datasets*100:.1f}%)" # Status is_annotated = self._is_annotated(self.current_idx) if is_annotated: ann = self.annotations.get(record.get('sample_id', self.current_idx), {}) status = f"✅ Validated as: {ann.get('human_verdict', 'unknown')}" else: status = "❓ Pending Validation" # Navigation info with chunk details nav = { 'chunk_info': f"Input Text: {chunk_num}/{self.total_chunks}", 'dataset_in_chunk': f"Dataset: {ds_in_chunk}/{total_in_chunk} in this chunk", 'record_info': f"Overall: {self.current_idx + 1}/{self.total_datasets}", 'can_prev': self.current_idx > 0, 'can_next': self.current_idx < self.total_datasets - 1 } return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]: """Annotate current record and move to next.""" if self.current_idx < len(self.records): record = self.records[self.current_idx] sample_id = record.get('sample_id', self.current_idx) self._save_annotation(sample_id, verdict, notes) self.next_record() return self.get_current_display()[:6] def next_record(self): """Move to next record.""" if self.current_idx < len(self.records) - 1: self.current_idx += 1 def prev_record(self): """Move to previous record.""" if self.current_idx > 0: self.current_idx -= 1 def skip_to_next_unannotated(self): """Skip to next unannotated record (also skipping one-word vague/descriptive).""" for i in range(self.current_idx + 1, len(self.records)): if not self._is_annotated(i) and not self._should_skip(i): self.current_idx = i return def get_statistics(self) -> str: """Get current annotation statistics as markdown.""" if not self.annotations: return "_No annotations yet_" total = len(self.annotations) human_dataset = sum(1 for a in self.annotations.values() if a['human_verdict'] == 'dataset') human_non = total - human_dataset agrees_ext = sum(1 for a in self.annotations.values() if a['human_agrees_extraction']) agrees_judge = sum(1 for a in self.annotations.values() if a['human_agrees_judge']) stats = f"""**Annotated:** {total}/{len(self.records)} **Human Verdicts:** - Dataset: {human_dataset} - Non-Dataset: {human_non} **Agreement Rates:** - Extraction Model: {agrees_ext/total*100:.1f}% - Judge (GPT-5.2): {agrees_judge/total*100:.1f}% """ return stats def create_app(input_file: str): """Create and configure Gradio app.""" annotator = ValidationAnnotator(input_file) # Custom CSS for the green button and dark mode toggle css = """ #accept_btn { background-color: #22c55e !important; color: white !important; } #accept_btn:hover { background-color: #16a34a !important; } #theme_toggle { position: fixed; top: 10px; right: 10px; z-index: 1000; padding: 8px 16px; border-radius: 20px; cursor: pointer; font-size: 14px; } """ # JavaScript for dark mode toggle js = """ function toggleDarkMode() { const body = document.body; const isDark = body.classList.contains('dark'); if (isDark) { body.classList.remove('dark'); localStorage.setItem('theme', 'light'); document.getElementById('theme_toggle').textContent = '🌙 Dark Mode'; } else { body.classList.add('dark'); localStorage.setItem('theme', 'dark'); document.getElementById('theme_toggle').textContent = '☀️ Light Mode'; } } // Apply saved theme on load document.addEventListener('DOMContentLoaded', function() { const savedTheme = localStorage.getItem('theme'); if (savedTheme === 'dark') { document.body.classList.add('dark'); const btn = document.getElementById('theme_toggle'); if (btn) btn.textContent = '☀️ Light Mode'; } }); """ with gr.Blocks(title="Dataset Annotation Tool", css=css, js=js) as app: # Theme toggle button gr.HTML('') gr.Markdown("# 📊 Dataset Annotation Tool") gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.") with gr.Row(): with gr.Column(scale=2): dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2) context_box = gr.HighlightedText( label="Context (±1 sentence, dataset highlighted)", color_map={"DATASET": "yellow"}, show_legend=False, combine_adjacent=True ) metadata_box = gr.Markdown(label="Metadata") show_ai_checkbox = gr.Checkbox(label="🤖 Show what the AI thinks", value=False) ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False) with gr.Column(scale=1): progress_box = gr.Textbox(label="Progress", interactive=False, lines=1) chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1) dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1) status_box = gr.Textbox(label="Status", interactive=False, lines=1) notes_box = gr.Textbox( label="Notes (optional)", placeholder="Add any comments about this dataset...", lines=3 ) with gr.Row(): accept_btn = gr.Button("✓ DATASET", variant="primary", size="lg", elem_id="accept_btn") reject_btn = gr.Button("✗ NOT A DATASET", variant="stop", size="lg") gr.Markdown("---") with gr.Row(): prev_btn = gr.Button("← Previous", size="sm") next_btn = gr.Button("Next →", size="sm") skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm") gr.Markdown("---") with gr.Accordion("📊 Live Statistics", open=True): stats_box = gr.Markdown() gr.Markdown("---") gr.Markdown(f"**Input:** `{Path(input_file).name}`") gr.Markdown(f"**Output:** `{annotator.output_file.name}`") nav_state = gr.State({}) def update_display(): name, context, metadata, ai_verdicts, progress, status, nav = annotator.get_current_display() chunk_info = nav.get('chunk_info', '') dataset_in_chunk = nav.get('dataset_in_chunk', '') stats = annotator.get_statistics() return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats def accept_and_next(notes): name, context, metadata, ai_verdicts, progress, status = annotator.annotate('dataset', notes) _, _, _, _, _, _, nav = annotator.get_current_display() chunk_info = nav.get('chunk_info', '') dataset_in_chunk = nav.get('dataset_in_chunk', '') stats = annotator.get_statistics() return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats def reject_and_next(notes): name, context, metadata, ai_verdicts, progress, status = annotator.annotate('non-dataset', notes) _, _, _, _, _, _, nav = annotator.get_current_display() chunk_info = nav.get('chunk_info', '') dataset_in_chunk = nav.get('dataset_in_chunk', '') stats = annotator.get_statistics() return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats def go_next(): annotator.next_record() return update_display() def go_prev(): annotator.prev_record() return update_display() def skip_unannotated(): annotator.skip_to_next_unannotated() return update_display() def toggle_ai_verdicts(show_ai): return gr.update(visible=show_ai) # Outputs - updated with chunk_info and dataset_in_chunk outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box] outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state, stats_box] accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate) reject_btn.click(reject_and_next, inputs=[notes_box], outputs=outputs_annotate) next_btn.click(go_next, outputs=outputs_list) prev_btn.click(go_prev, outputs=outputs_list) skip_btn.click(skip_unannotated, outputs=outputs_list) show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box]) app.load(update_display, outputs=outputs_list) return app def main(): parser = argparse.ArgumentParser(description="Validation annotation Gradio app") parser.add_argument( "--input", type=str, default="/Users/rafaelmacalaba/WBG/monitoring_of_datause/revalidation/analysis/unhcr_reliefweb/validation/validation_sample.jsonl", help="Input JSONL file with validation samples" ) parser.add_argument( "--share", action="store_true", help="Create a public share link" ) parser.add_argument( "--port", type=int, default=7860, help="Port to run the app on (default: 7860)" ) args = parser.parse_args() if not Path(args.input).exists(): print(f"Error: Input file not found: {args.input}") print("\nRun the sampling script first:") print(" python sample_for_validation.py") return app = create_app(args.input) app.launch(share=args.share, server_port=args.port) if __name__ == "__main__": main()