rafmacalaba committed on
Commit
64b4b2f
Β·
1 Parent(s): b040870

add pdf viewer

Browse files
Files changed (2) hide show
  1. app.py +252 -64
  2. upload_pdfs.py +133 -0
app.py CHANGED
@@ -16,7 +16,17 @@ import gradio as gr
16
  import json
17
  import re
18
  import os
 
19
  from pathlib import Path
 
 
 
 
 
 
 
 
 
20
  from typing import Dict, List, Tuple, Optional
21
  from datetime import datetime
22
  from huggingface_hub import HfApi, login
@@ -31,13 +41,20 @@ class ValidationAnnotator:
31
  No 4o data available - only judge (GPT-5.2) verdicts are shown.
32
  """
33
 
34
- def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None):
 
35
  self.input_file = Path(input_file)
36
  self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
37
 
38
  # HF Datasets integration
39
  self.hf_dataset_repo = hf_dataset_repo
40
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
 
 
 
 
 
 
41
  self.hf_enabled = False
42
 
43
  # Try to enable HF Datasets if credentials provided
@@ -296,19 +313,16 @@ class ValidationAnnotator:
296
  # Show all records including siblings
297
  self.filtered_indices = list(range(len(self.records)))
298
  else:
299
- # Filter by extraction_tag OR judge_tag matching the filter
300
  # AND exclude siblings (only show primary samples)
301
  self.filtered_indices = [
302
  i for i, record in enumerate(self.records)
303
- if (record.get('extraction_tag') == filter_value or record.get('judge_tag') == filter_value)
304
  and record.get('is_primary', True) # Only primary samples, not siblings
305
  ]
306
 
307
- # Reset to first filtered record if current position is not in filtered set
308
- if self.current_idx not in self.filtered_indices and self.filtered_indices:
309
- self.current_idx = self.filtered_indices[0]
310
- elif not self.filtered_indices:
311
- self.current_idx = len(self.records) # No matching records
312
 
313
  def _is_annotated(self, idx: int) -> bool:
314
  """Check if a record has been annotated."""
@@ -332,18 +346,28 @@ class ValidationAnnotator:
332
  return False
333
 
334
  def _find_next_unannotated(self):
335
- """Find the next unannotated record (skipping one-word vague/descriptive)."""
336
- for i in range(len(self.records)):
337
- if not self._is_annotated(i) and not self._should_skip(i):
338
- self.current_idx = i
 
 
 
 
339
  return
340
- # All annotated or skippable
341
- self.current_idx = len(self.records)
 
 
 
 
 
 
342
 
343
- def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict]:
344
  """Get current record for display."""
345
  if self.current_idx >= len(self.records):
346
- return "πŸŽ‰ All samples validated!", [], "", "", f"Progress: {len(self.annotations)}/{len(self.records)} (100%)", "βœ… Complete", {}
347
 
348
  record = self.records[self.current_idx]
349
 
@@ -379,20 +403,26 @@ class ValidationAnnotator:
379
  if record.get('judge_data_type'):
380
  ai_verdicts_str += f"**Data Type:** {record['judge_data_type']}\n"
381
  if record.get('judge_reasoning'):
382
- reasoning = record['judge_reasoning'][:300]
383
  ai_verdicts_str += f"\n*Reasoning:* {reasoning}..."
384
 
 
385
  # Metadata
386
  metadata_parts = []
387
- metadata_parts.append(f"**Stratum:** `{record['stratum']}`")
388
- metadata_parts.append(f"**Document:** `{record['document'][:50]}...`")
 
 
 
 
 
389
  is_primary = record.get('is_primary', True)
390
- metadata_parts.append(f"**Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}")
391
  if record.get('geography'):
392
  geo = record['geography']
393
  if isinstance(geo, dict):
394
  geo = geo.get('text', str(geo))
395
- metadata_parts.append(f"**Geography:** {geo}")
396
  metadata_str = "\n".join(metadata_parts)
397
 
398
  # Get chunk info
@@ -419,7 +449,38 @@ class ValidationAnnotator:
419
  'can_next': self.current_idx < self.total_datasets - 1
420
  }
421
 
422
- return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
  def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]:
425
  """Annotate current record and move to next."""
@@ -431,14 +492,30 @@ class ValidationAnnotator:
431
  return self.get_current_display()[:6]
432
 
433
  def next_record(self):
434
- """Move to next record."""
435
- if self.current_idx < len(self.records) - 1:
436
- self.current_idx += 1
 
 
 
 
 
 
 
 
437
 
438
  def prev_record(self):
439
- """Move to previous record."""
440
- if self.current_idx > 0:
441
- self.current_idx -= 1
 
 
 
 
 
 
 
 
442
 
443
  def skip_to_next_unannotated(self):
444
  """Skip to next unannotated record (also skipping one-word vague/descriptive)."""
@@ -471,9 +548,10 @@ class ValidationAnnotator:
471
  return stats
472
 
473
 
474
- def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None):
 
475
  """Create and configure Gradio app."""
476
- annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token)
477
 
478
  # Custom CSS for the green button and dark mode toggle
479
  css = """
@@ -520,6 +598,20 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
520
  const btn = document.getElementById('theme_toggle');
521
  if (btn) btn.textContent = 'β˜€οΈ Light Mode';
522
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  });
524
  """
525
 
@@ -606,25 +698,35 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
606
  **Ready to start?** Click the **"Annotate"** tab above to begin!
607
  """)
608
 
 
 
 
 
 
 
609
  # Tab 2: Annotation Interface
610
  with gr.Tab("✏️ Annotate") as annotate_tab:
611
- gr.Markdown("Review and annotate dataset mentions. Each annotation is saved in real-time.")
612
 
 
613
  with gr.Row():
614
- with gr.Column(scale=2):
615
- dataset_name = gr.Textbox(label="Dataset Name", interactive=False, max_lines=2)
 
616
  context_box = gr.HighlightedText(
617
  label="Context (Β±2 sentences, dataset highlighted)",
 
618
  color_map={"DATASET": "yellow"},
619
  show_legend=False,
620
  combine_adjacent=True
621
  )
622
- metadata_box = gr.Markdown(label="Metadata")
623
 
624
  show_ai_checkbox = gr.Checkbox(label="πŸ€– Show what the AI thinks", value=False)
625
- ai_verdicts_box = gr.Markdown(label="AI Analysis", visible=False)
626
 
627
- with gr.Column(scale=1):
 
628
  # Filter dropdown
629
  filter_dropdown = gr.Dropdown(
630
  choices=["All", "named", "descriptive", "vague", "non-dataset"],
@@ -633,10 +735,10 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
633
  interactive=True
634
  )
635
 
636
- progress_box = gr.Textbox(label="Progress", interactive=False, lines=1)
637
- chunk_info_box = gr.Textbox(label="Input Text Position", interactive=False, lines=1)
638
- dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", interactive=False, lines=1)
639
- status_box = gr.Textbox(label="Status", interactive=False, lines=1)
640
 
641
  notes_box = gr.Textbox(
642
  label="Notes (optional)",
@@ -656,12 +758,8 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
656
 
657
  skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
658
 
659
- gr.Markdown("---")
660
-
661
- with gr.Accordion("πŸ“Š Live Statistics", open=True):
662
- stats_box = gr.Markdown()
663
-
664
- gr.Markdown("---")
665
 
666
  # Download button for manual backup
667
  download_btn = gr.DownloadButton(
@@ -675,36 +773,72 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
675
  if annotator.hf_enabled:
676
  gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
677
  else:
678
- gr.Markdown("⚠️ **Auto-backup disabled** (set HF_TOKEN secret to enable)")
679
 
680
- gr.Markdown("---")
681
  gr.Markdown(f"**Input:** `{Path(input_file).name}`")
682
- gr.Markdown(f"**Output:** `{annotator.output_file.name}`")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
  nav_state = gr.State({})
685
 
686
  def update_display():
687
- name, context, metadata, ai_verdicts, progress, status, nav = annotator.get_current_display()
 
688
  chunk_info = nav.get('chunk_info', '')
689
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
690
  stats = annotator.get_statistics()
691
- return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats
 
 
 
 
 
 
692
 
693
  def accept_and_next(notes):
694
  name, context, metadata, ai_verdicts, progress, status = annotator.annotate('dataset', notes)
695
- _, _, _, _, _, _, nav = annotator.get_current_display()
696
  chunk_info = nav.get('chunk_info', '')
697
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
698
  stats = annotator.get_statistics()
699
- return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats
 
 
 
 
700
 
701
  def reject_and_next(notes):
702
  name, context, metadata, ai_verdicts, progress, status = annotator.annotate('non-dataset', notes)
703
- _, _, _, _, _, _, nav = annotator.get_current_display()
704
  chunk_info = nav.get('chunk_info', '')
705
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
706
  stats = annotator.get_statistics()
707
- return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats
 
 
 
 
708
 
709
  def go_next():
710
  annotator.next_record()
@@ -721,7 +855,8 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
721
  def toggle_ai_verdicts(show_ai):
722
  if show_ai:
723
  # Get current AI verdicts content
724
- _, _, _, ai_verdicts, _, _, _ = annotator.get_current_display()
 
725
  return gr.update(visible=True, value=ai_verdicts)
726
  return gr.update(visible=False)
727
 
@@ -732,8 +867,11 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
732
  return None
733
 
734
  # Outputs - updated with chunk_info and dataset_in_chunk
735
- outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box]
736
- outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state, stats_box]
 
 
 
737
 
738
  accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
739
  get_download_file, outputs=[download_btn]
@@ -752,19 +890,44 @@ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token:
752
  filter_dropdown.change(apply_filter, inputs=[filter_dropdown], outputs=outputs_list)
753
  show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box])
754
 
755
- # Load data when app starts AND when tab is selected
756
- app.load(update_display, outputs=outputs_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
  annotate_tab.select(update_display, outputs=outputs_list)
 
 
758
 
759
  return app
760
 
761
 
762
  # For Hugging Face Spaces deployment
763
  if __name__ == "__main__":
764
- # Use the data file in the repository
765
- input_file = "validation_sample_filtering_retained.jsonl"
 
 
 
 
 
 
766
 
 
 
767
  # Check if file exists
 
768
  if not Path(input_file).exists():
769
  raise FileNotFoundError(
770
  f"Input file '{input_file}' not found. "
@@ -775,6 +938,31 @@ if __name__ == "__main__":
775
  hf_dataset_repo = os.getenv("HF_DATASET_REPO") # e.g., "username/reliefweb-annotations"
776
  hf_token = os.getenv("HF_TOKEN") # HF write token
777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
  # Create and launch the app
779
- app = create_app(input_file, hf_dataset_repo, hf_token)
780
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
16
  import json
17
  import re
18
  import os
19
+ import argparse
20
  from pathlib import Path
21
+ from dotenv import load_dotenv
22
+
23
+ # Load .env for local development
24
+ load_dotenv()
25
+
26
+ try:
27
+ from gradio_pdf import PDF as gr_pdf
28
+ except ImportError:
29
+ gr_pdf = None
30
  from typing import Dict, List, Tuple, Optional
31
  from datetime import datetime
32
  from huggingface_hub import HfApi, login
 
41
  No 4o data available - only judge (GPT-5.2) verdicts are shown.
42
  """
43
 
44
+ def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
45
+ pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
46
  self.input_file = Path(input_file)
47
  self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
48
 
49
  # HF Datasets integration
50
  self.hf_dataset_repo = hf_dataset_repo
51
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
52
+
53
+ # PDF configuration
54
+ self.pdf_dir = Path(pdf_dir) if pdf_dir else None
55
+ self.pdf_url_base = pdf_url_base
56
+ if self.pdf_dir and not self.pdf_dir.exists():
57
+ print(f"⚠️ PDF directory not found: {self.pdf_dir}")
58
  self.hf_enabled = False
59
 
60
  # Try to enable HF Datasets if credentials provided
 
313
  # Show all records including siblings
314
  self.filtered_indices = list(range(len(self.records)))
315
  else:
316
+ # Filter by extraction_tag only (not judge_tag)
317
  # AND exclude siblings (only show primary samples)
318
  self.filtered_indices = [
319
  i for i, record in enumerate(self.records)
320
+ if record.get('extraction_tag') == filter_value
321
  and record.get('is_primary', True) # Only primary samples, not siblings
322
  ]
323
 
324
+ # Always jump to first unannotated record in the new filtered set for determinism
325
+ self._find_next_unannotated()
 
 
 
326
 
327
  def _is_annotated(self, idx: int) -> bool:
328
  """Check if a record has been annotated."""
 
346
  return False
347
 
348
  def _find_next_unannotated(self):
349
+ """Find the next unannotated record within the current filtered set."""
350
+ if not self.filtered_indices:
351
+ self.current_idx = len(self.records)
352
+ return
353
+
354
+ for idx in self.filtered_indices:
355
+ if not self._is_annotated(idx) and not self._should_skip(idx):
356
+ self.current_idx = idx
357
  return
358
+
359
+ # All filtered records are annotated or skippable, go to the first filtered one if we have any
360
+ # or stick to the end if we want to show the completion screen.
361
+ # Actually, let's go to the last filtered one if all are annotated.
362
+ if self.filtered_indices:
363
+ self.current_idx = self.filtered_indices[0]
364
+ else:
365
+ self.current_idx = len(self.records)
366
 
367
+ def get_current_display(self) -> Tuple[str, list, str, str, str, str, Dict, str]:
368
  """Get current record for display."""
369
  if self.current_idx >= len(self.records):
370
+ return "πŸŽ‰ All samples validated!", [], "", "", f"Progress: {len(self.annotations)}/{len(self.records)} (100%)", "βœ… Complete", {}, ""
371
 
372
  record = self.records[self.current_idx]
373
 
 
403
  if record.get('judge_data_type'):
404
  ai_verdicts_str += f"**Data Type:** {record['judge_data_type']}\n"
405
  if record.get('judge_reasoning'):
406
+ reasoning = record['judge_reasoning']
407
  ai_verdicts_str += f"\n*Reasoning:* {reasoning}..."
408
 
409
+ # Metadata
410
  # Metadata
411
  metadata_parts = []
412
+ metadata_parts.append(f"- **Stratum:** `{record['stratum']}`")
413
+ # metadata_parts.append(f"- **Document:** `{record['document']}...`")
414
+ if record.get("source_document"):
415
+ metadata_parts.append(f"- **Source File:** `{record.get('source_document')}`")
416
+ if record.get("page_number"):
417
+ metadata_parts.append(f"- **Page(s):** {record.get('page_number')}")
418
+
419
  is_primary = record.get('is_primary', True)
420
+ metadata_parts.append(f"- **Type:** {'Primary sample' if is_primary else 'Sibling (same chunk)'}")
421
  if record.get('geography'):
422
  geo = record['geography']
423
  if isinstance(geo, dict):
424
  geo = geo.get('text', str(geo))
425
+ metadata_parts.append(f"- **Geography:** {geo}")
426
  metadata_str = "\n".join(metadata_parts)
427
 
428
  # Get chunk info
 
449
  'can_next': self.current_idx < self.total_datasets - 1
450
  }
451
 
452
+ # PDF Source path and page
453
+ source_doc = record.get("source_document")
454
+ page_num = record.get("page_number")
455
+ pdf_value = None
456
+
457
+ # Convert page_num to int and add 1 (offset from 0-indexed data)
458
+ try:
459
+ if page_num:
460
+ page_num = int(page_num) + 1
461
+ else:
462
+ page_num = 1
463
+ except (ValueError, TypeError):
464
+ page_num = 1
465
+
466
+ if source_doc and self.pdf_dir:
467
+ # Local PDF directory
468
+ pdf_path = self.pdf_dir / source_doc
469
+ if pdf_path.exists():
470
+ pdf_value = str(pdf_path.absolute())
471
+ print(f"πŸ“„ Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
472
+ else:
473
+ print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
474
+ elif source_doc and self.pdf_url_base:
475
+ # Remote PDF via URL (e.g., HF Datasets)
476
+ # Remove any leading slashes from source_doc
477
+ source_doc_clean = source_doc.lstrip('/')
478
+ pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
479
+ print(f"🌐 Using remote PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
480
+ elif source_doc:
481
+ print(f"ℹ️ PDF source specified ({source_doc}) but no pdf_dir or pdf_url_base provided.", flush=True)
482
+
483
+ return record['text'], context, metadata_str, ai_verdicts_str, progress, status, nav, pdf_value, page_num
484
 
485
  def annotate(self, verdict: str, notes: str = "") -> Tuple[str, list, str, str, str, str]:
486
  """Annotate current record and move to next."""
 
492
  return self.get_current_display()[:6]
493
 
494
  def next_record(self):
495
+ """Move to next record in the filtered set."""
496
+ if not self.filtered_indices:
497
+ return
498
+
499
+ try:
500
+ current_pos = self.filtered_indices.index(self.current_idx)
501
+ if current_pos < len(self.filtered_indices) - 1:
502
+ self.current_idx = self.filtered_indices[current_pos + 1]
503
+ except ValueError:
504
+ # Current idx not in filtered set (maybe filter changed), jump to first
505
+ self.current_idx = self.filtered_indices[0]
506
 
507
  def prev_record(self):
508
+ """Move to previous record in the filtered set."""
509
+ if not self.filtered_indices:
510
+ return
511
+
512
+ try:
513
+ current_pos = self.filtered_indices.index(self.current_idx)
514
+ if current_pos > 0:
515
+ self.current_idx = self.filtered_indices[current_pos - 1]
516
+ except ValueError:
517
+ # Current idx not in filtered set, jump to first
518
+ self.current_idx = self.filtered_indices[0]
519
 
520
  def skip_to_next_unannotated(self):
521
  """Skip to next unannotated record (also skipping one-word vague/descriptive)."""
 
548
  return stats
549
 
550
 
551
+ def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
552
+ pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
553
  """Create and configure Gradio app."""
554
+ annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
555
 
556
  # Custom CSS for the green button and dark mode toggle
557
  css = """
 
598
  const btn = document.getElementById('theme_toggle');
599
  if (btn) btn.textContent = 'β˜€οΈ Light Mode';
600
  }
601
+
602
+ // Force resize when switching to Annotate tab to help PDF viewer
603
+ document.body.addEventListener('click', function(e) {
604
+ if (e.target && e.target.innerText && e.target.innerText.includes('Annotate')) {
605
+ console.log('Annotate tab clicked - forcing resize');
606
+ setTimeout(() => {
607
+ window.dispatchEvent(new Event('resize'));
608
+ // Also try to find any canvases and nudge them
609
+ document.querySelectorAll('canvas').forEach(c => {
610
+ c.dispatchEvent(new Event('resize'));
611
+ });
612
+ }, 500);
613
+ }
614
+ }, true);
615
  });
616
  """
617
 
 
698
  **Ready to start?** Click the **"Annotate"** tab above to begin!
699
  """)
700
 
701
+ # Get initial values for robust first render
702
+ init_name, init_context, init_metadata, init_ai, init_progress, init_status, init_nav, init_pdf, init_page = annotator.get_current_display()
703
+ init_chunk_info = init_nav.get('chunk_info', '')
704
+ init_dataset_in_chunk = init_nav.get('dataset_in_chunk', '')
705
+ init_stats = annotator.get_statistics()
706
+
707
  # Tab 2: Annotation Interface
708
  with gr.Tab("✏️ Annotate") as annotate_tab:
709
+ gr.Markdown("Review and annotate dataset mentions. PDF viewer is below for reference.")
710
 
711
+ # Top Section: Annotation Controls
712
  with gr.Row():
713
+ # Dataset Info & Context
714
+ with gr.Column(scale=3):
715
+ dataset_name = gr.Textbox(label="Dataset Name", value=init_name, interactive=False, max_lines=2)
716
  context_box = gr.HighlightedText(
717
  label="Context (Β±2 sentences, dataset highlighted)",
718
+ value=init_context,
719
  color_map={"DATASET": "yellow"},
720
  show_legend=False,
721
  combine_adjacent=True
722
  )
723
+ metadata_box = gr.Markdown(init_metadata, label="Metadata")
724
 
725
  show_ai_checkbox = gr.Checkbox(label="πŸ€– Show what the AI thinks", value=False)
726
+ ai_verdicts_box = gr.Markdown(init_ai, label="AI Analysis", visible=False)
727
 
728
+ # Controls & Progress
729
+ with gr.Column(scale=2):
730
  # Filter dropdown
731
  filter_dropdown = gr.Dropdown(
732
  choices=["All", "named", "descriptive", "vague", "non-dataset"],
 
735
  interactive=True
736
  )
737
 
738
+ progress_box = gr.Textbox(label="Progress", value=init_progress, interactive=False, lines=1)
739
+ chunk_info_box = gr.Textbox(label="Input Text Position", value=init_chunk_info, interactive=False, lines=1)
740
+ dataset_in_chunk_box = gr.Textbox(label="Dataset in Chunk", value=init_dataset_in_chunk, interactive=False, lines=1)
741
+ status_box = gr.Textbox(label="Status", value=init_status, interactive=False, lines=1)
742
 
743
  notes_box = gr.Textbox(
744
  label="Notes (optional)",
 
758
 
759
  skip_btn = gr.Button("⏭️ Skip to Next Unannotated", size="sm")
760
 
761
+ with gr.Accordion("πŸ“Š Live Statistics", open=False):
762
+ stats_box = gr.Markdown(init_stats)
 
 
 
 
763
 
764
  # Download button for manual backup
765
  download_btn = gr.DownloadButton(
 
773
  if annotator.hf_enabled:
774
  gr.Markdown(f"☁️ **Auto-backup enabled:** [{annotator.hf_dataset_repo}](https://huggingface.co/datasets/{annotator.hf_dataset_repo})")
775
  else:
776
+ gr.Markdown("⚠️ **Auto-backup disabled**")
777
 
 
778
  gr.Markdown(f"**Input:** `{Path(input_file).name}`")
779
+
780
+ gr.Markdown("---")
781
+
782
+ # Bottom Section: PDF Viewer (Full Width)
783
+ with gr.Row():
784
+ with gr.Column(scale=1):
785
+ if gr_pdf is None:
786
+ gr.Markdown("### ⚠️ `gradio-pdf` not found\nPlease run `uv pip install gradio-pdf` and restart.")
787
+ pdf_viewer = gr.HTML(visible=False)
788
+ else:
789
+ # Use gradio-pdf component
790
+ pdf_viewer = gr_pdf(
791
+ label="Source Document",
792
+ height=1000,
793
+ visible=True
794
+ )
795
+
796
+ refresh_pdf_btn = gr.Button("πŸ”„ Reload PDF Viewer", size="sm")
797
+
798
+ # Hidden PDF component to authorize file serving
799
+ if annotator.pdf_dir:
800
+ gr.File(value=None, visible=False, interactive=False)
801
+
802
 
803
  nav_state = gr.State({})
804
 
805
  def update_display():
806
+ print(f"πŸ“‘ Updating display for index {annotator.current_idx}...", flush=True)
807
+ name, context, metadata, ai_verdicts, progress, status, nav, pdf_path, page_num = annotator.get_current_display()
808
  chunk_info = nav.get('chunk_info', '')
809
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
810
  stats = annotator.get_statistics()
811
+
812
+ # Use gr.update for gradio_pdf component
813
+ pdf_update = gr.update(value=pdf_path, starting_page=page_num)
814
+ print(f"πŸ–ΌοΈ PDF Update: path={pdf_path}, page={page_num}", flush=True)
815
+
816
+ return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats, pdf_update
817
+
818
 
819
  def accept_and_next(notes):
820
  name, context, metadata, ai_verdicts, progress, status = annotator.annotate('dataset', notes)
821
+ _, _, _, _, _, _, nav, pdf_value, page_num = annotator.get_current_display()
822
  chunk_info = nav.get('chunk_info', '')
823
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
824
  stats = annotator.get_statistics()
825
+
826
+ # Use gr.update for gradio_pdf component
827
+ pdf_update = gr.update(value=pdf_value, starting_page=page_num)
828
+
829
+ return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats, pdf_update
830
 
831
  def reject_and_next(notes):
832
  name, context, metadata, ai_verdicts, progress, status = annotator.annotate('non-dataset', notes)
833
+ _, _, _, _, _, _, nav, pdf_value, page_num = annotator.get_current_display()
834
  chunk_info = nav.get('chunk_info', '')
835
  dataset_in_chunk = nav.get('dataset_in_chunk', '')
836
  stats = annotator.get_statistics()
837
+
838
+ # Use gr.update for gradio_pdf component
839
+ pdf_update = gr.update(value=pdf_value, starting_page=page_num)
840
+
841
+ return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, "", nav, stats, pdf_update
842
 
843
  def go_next():
844
  annotator.next_record()
 
855
  def toggle_ai_verdicts(show_ai):
856
  if show_ai:
857
  # Get current AI verdicts content
858
+ display_data = annotator.get_current_display()
859
+ ai_verdicts = display_data[3] # ai_verdicts_str is the 4th value
860
  return gr.update(visible=True, value=ai_verdicts)
861
  return gr.update(visible=False)
862
 
 
867
  return None
868
 
869
  # Outputs - updated with chunk_info and dataset_in_chunk
870
+
871
+
872
+
873
+ outputs_list = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, nav_state, stats_box, pdf_viewer]
874
+ outputs_annotate = [dataset_name, context_box, metadata_box, ai_verdicts_box, progress_box, chunk_info_box, dataset_in_chunk_box, status_box, notes_box, nav_state, stats_box, pdf_viewer]
875
 
876
  accept_btn.click(accept_and_next, inputs=[notes_box], outputs=outputs_annotate).then(
877
  get_download_file, outputs=[download_btn]
 
890
  filter_dropdown.change(apply_filter, inputs=[filter_dropdown], outputs=outputs_list)
891
  show_ai_checkbox.change(toggle_ai_verdicts, inputs=[show_ai_checkbox], outputs=[ai_verdicts_box])
892
 
893
+ def initial_load_no_pdf():
894
+ """Initial load without PDF to avoid the blank page bug on first render.
895
+ The PDF will be loaded when the user first clicks the Annotate tab."""
896
+ print("πŸš€ Initial app load - PDF set to None (will load on tab select)", flush=True)
897
+ name, context, metadata, ai_verdicts, progress, status, nav, pdf_path, page_num = annotator.get_current_display()
898
+ chunk_info = nav.get('chunk_info', '')
899
+ dataset_in_chunk = nav.get('dataset_in_chunk', '')
900
+ stats = annotator.get_statistics()
901
+ # Return None for PDF to avoid initial render bug
902
+ pdf_update = gr.update(value=None)
903
+ return name, context, metadata, ai_verdicts, progress, chunk_info, dataset_in_chunk, status, nav, stats, pdf_update
904
+
905
+ # Load data when app starts - WITHOUT PDF to avoid blank page bug
906
+ app.load(initial_load_no_pdf, outputs=outputs_list)
907
+
908
+ # When Annotate tab is selected, load the PDF (this is the "second update" that triggers proper render)
909
  annotate_tab.select(update_display, outputs=outputs_list)
910
+ refresh_pdf_btn.click(update_display, outputs=outputs_list)
911
+
912
 
913
  return app
914
 
915
 
916
  # For Hugging Face Spaces deployment
917
  if __name__ == "__main__":
918
+ # Parse command line arguments
919
+ parser = argparse.ArgumentParser(description="Dataset Annotation Tool")
920
+ parser.add_argument("--input", "-i", type=str, default="validation_sample_filtering_retained.jsonl",
921
+ help="Input JSONL file (default: validation_sample_filtering_retained.jsonl)")
922
+ parser.add_argument("--pdf-dir", "-p", type=str, default=None,
923
+ help="Directory containing local PDF files (optional)")
924
+ parser.add_argument("--pdf-url-base", "-u", type=str, default=None,
925
+ help="Base URL for remote PDFs (if not using local files)")
926
 
927
+ args = parser.parse_args()
928
+
929
  # Check if file exists
930
+ input_file = args.input
931
  if not Path(input_file).exists():
932
  raise FileNotFoundError(
933
  f"Input file '{input_file}' not found. "
 
938
  hf_dataset_repo = os.getenv("HF_DATASET_REPO") # e.g., "username/reliefweb-annotations"
939
  hf_token = os.getenv("HF_TOKEN") # HF write token
940
 
941
+ # Determine PDF source: command-line args take priority, then env vars
942
+ pdf_dir = args.pdf_dir
943
+ pdf_url_base = args.pdf_url_base
944
+
945
+ # If no explicit PDF source, check for HF PDF repo environment variable
946
+ if not pdf_dir and not pdf_url_base:
947
+ hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO") # e.g., "ai4data/reliefweb-pdfs"
948
+ if hf_pdf_repo:
949
+ pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
950
+ print(f"🌐 Using HF PDF repository: {hf_pdf_repo}", flush=True)
951
+ print(f" PDF URL base: {pdf_url_base}", flush=True)
952
+ else:
953
+ print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
954
+
955
  # Create and launch the app
956
+ app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
957
+
958
+ # Ensure allowed paths are absolute for Gradio (only needed for local files)
959
+ allowed = []
960
+ if pdf_dir:
961
+ pdf_dir_parent = str(Path(pdf_dir).parent.resolve())
962
+ allowed = [pdf_dir_parent]
963
+ print(f"πŸš€ Launching with allowed_paths: {allowed}", flush=True)
964
+ print(f"πŸ“‚ PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
965
+ else:
966
+ print("πŸš€ Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
967
+
968
+ app.launch(allowed_paths=allowed)
upload_pdfs.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Upload PDFs to Hugging Face Datasets.
4
+
5
+ Usage:
6
+ python upload_pdfs.py --repo-id your-username/reliefweb-pdfs --pdf-dir ./unchr_reliefweb_pdfs
7
+
8
+ Options:
9
+ --max-size-mb: Skip files larger than this (default: 50MB)
10
+ --batch-size: Upload in batches of N files (default: 100)
11
+
12
+ Environment:
13
+ HF_TOKEN: Your Hugging Face write token (or use --token flag)
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+ import tempfile
19
+ import shutil
20
+ from pathlib import Path
21
+ from dotenv import load_dotenv
22
+ from huggingface_hub import HfApi, login
23
+
24
+ # Load environment variables from .env file
25
+ load_dotenv()
26
+
27
+
28
+ def upload_pdfs(repo_id: str, pdf_dir: str, token: str = None, private: bool = True,
29
+ max_size_mb: float = 50, batch_size: int = 100):
30
+ """Upload a folder of PDFs to a Hugging Face Dataset repository."""
31
+
32
+ pdf_path = Path(pdf_dir)
33
+ if not pdf_path.exists():
34
+ raise FileNotFoundError(f"PDF directory not found: {pdf_dir}")
35
+
36
+ # Get all PDFs and filter by size
37
+ all_pdfs = list(pdf_path.glob("*.pdf"))
38
+ max_size_bytes = max_size_mb * 1024 * 1024
39
+
40
+ valid_pdfs = []
41
+ skipped_pdfs = []
42
+
43
+ for pdf in all_pdfs:
44
+ size = pdf.stat().st_size
45
+ if size <= max_size_bytes:
46
+ valid_pdfs.append(pdf)
47
+ else:
48
+ skipped_pdfs.append((pdf.name, size / (1024 * 1024)))
49
+
50
+ print(f"πŸ“ Found {len(all_pdfs)} PDF files in {pdf_dir}")
51
+ print(f"βœ… Will upload: {len(valid_pdfs)} files (under {max_size_mb}MB)")
52
+
53
+ if skipped_pdfs:
54
+ print(f"⚠️ Skipping {len(skipped_pdfs)} files (too large):")
55
+ for name, size in skipped_pdfs[:5]: # Show first 5
56
+ print(f" - {name}: {size:.1f}MB")
57
+ if len(skipped_pdfs) > 5:
58
+ print(f" ... and {len(skipped_pdfs) - 5} more")
59
+
60
+ if not valid_pdfs:
61
+ print("❌ No valid PDF files to upload. Exiting.")
62
+ return
63
+
64
+ # Login to HF
65
+ hf_token = token or os.getenv("HF_TOKEN")
66
+ if not hf_token:
67
+ raise ValueError("HF_TOKEN not set. Pass --token or set HF_TOKEN environment variable.")
68
+
69
+ login(token=hf_token, add_to_git_credential=False)
70
+ api = HfApi()
71
+
72
+ # Create repo if it doesn't exist
73
+ try:
74
+ api.create_repo(repo_id, repo_type="dataset", private=private, exist_ok=True)
75
+ print(f"βœ… Repository ready: https://huggingface.co/datasets/{repo_id}")
76
+ except Exception as e:
77
+ print(f"⚠️ Repo creation note: {e}")
78
+
79
+ # Upload in batches
80
+ total_batches = (len(valid_pdfs) + batch_size - 1) // batch_size
81
+
82
+ for batch_num in range(total_batches):
83
+ start_idx = batch_num * batch_size
84
+ end_idx = min(start_idx + batch_size, len(valid_pdfs))
85
+ batch_files = valid_pdfs[start_idx:end_idx]
86
+
87
+ print(f"\nπŸš€ Uploading batch {batch_num + 1}/{total_batches} ({len(batch_files)} files)...")
88
+
89
+ # Create temp directory with just this batch
90
+ with tempfile.TemporaryDirectory() as temp_dir:
91
+ for pdf in batch_files:
92
+ shutil.copy2(pdf, temp_dir)
93
+
94
+ api.upload_folder(
95
+ folder_path=temp_dir,
96
+ repo_id=repo_id,
97
+ repo_type="dataset",
98
+ commit_message=f"Upload batch {batch_num + 1}/{total_batches} ({len(batch_files)} PDFs)",
99
+ )
100
+
101
+ print(f" βœ… Batch {batch_num + 1} complete")
102
+
103
+ print(f"\nπŸŽ‰ Upload complete! {len(valid_pdfs)} files uploaded.")
104
+ print(f"πŸ“Ž View at: https://huggingface.co/datasets/{repo_id}")
105
+ print(f"\nπŸ’‘ To use in app, set:")
106
+ print(f" --pdf-url-base https://huggingface.co/datasets/{repo_id}/resolve/main/")
107
+
108
+
109
if __name__ == "__main__":
    # CLI entry point: forward parsed arguments straight to upload_pdfs().
    cli = argparse.ArgumentParser(description="Upload PDFs to Hugging Face Datasets")
    cli.add_argument("--repo-id", "-r", required=True,
                     help="HF dataset repo ID (e.g., username/reliefweb-pdfs)")
    cli.add_argument("--pdf-dir", "-d", required=True,
                     help="Local directory containing PDF files")
    cli.add_argument("--token", "-t", default=None,
                     help="HF write token (or set HF_TOKEN env var)")
    cli.add_argument("--public", action="store_true",
                     help="Make the dataset public (default: private)")
    cli.add_argument("--max-size-mb", type=float, default=50,
                     help="Skip files larger than this (MB, default: 50)")
    cli.add_argument("--batch-size", type=int, default=100,
                     help="Upload in batches of N files (default: 100)")
    opts = cli.parse_args()

    upload_pdfs(
        repo_id=opts.repo_id,
        pdf_dir=opts.pdf_dir,
        token=opts.token,
        private=not opts.public,
        max_size_mb=opts.max_size_mb,
        batch_size=opts.batch_size,
    )