rafmacalaba committed on
Commit
1092ade
·
1 Parent(s): 7dd10e6

add caching

Browse files
Files changed (1) hide show
  1. app.py +45 -6
app.py CHANGED
@@ -42,7 +42,7 @@ class ValidationAnnotator:
42
  """
43
 
44
  def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
45
- pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
46
  self.input_file = Path(input_file)
47
  self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
48
 
@@ -53,6 +53,8 @@ class ValidationAnnotator:
53
  # PDF configuration
54
  self.pdf_dir = Path(pdf_dir) if pdf_dir else None
55
  self.pdf_url_base = pdf_url_base
 
 
56
  if self.pdf_dir and not self.pdf_dir.exists():
57
  print(f"⚠️ PDF directory not found: {self.pdf_dir}")
58
  self.hf_enabled = False
@@ -471,6 +473,28 @@ class ValidationAnnotator:
471
  print(f"πŸ“„ Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
472
  else:
473
  print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  elif source_doc and self.pdf_url_base:
475
  # Remote PDF via URL (e.g., HF Datasets)
476
  # Remove any leading slashes from source_doc
@@ -549,9 +573,9 @@ class ValidationAnnotator:
549
 
550
 
551
  def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
552
- pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
553
  """Create and configure Gradio app."""
554
- annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
555
 
556
  # Custom CSS for the green button and dark mode toggle
557
  css = """
@@ -943,6 +967,7 @@ if __name__ == "__main__":
943
  pdf_url_base = args.pdf_url_base
944
 
945
  # If no explicit PDF source, check for HF PDF repo environment variable
 
946
  if not pdf_dir and not pdf_url_base:
947
  hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO") # e.g., "ai4data/reliefweb-pdfs"
948
  if hf_pdf_repo:
@@ -951,15 +976,20 @@ if __name__ == "__main__":
951
  # Already a full URL, use it directly (ensure it ends with /)
952
  pdf_url_base = hf_pdf_repo.rstrip('/') + '/'
953
  else:
954
- # Repo ID format, construct the URL
 
 
955
  pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
 
956
  print(f"🌐 Using HF PDF repository: {hf_pdf_repo}", flush=True)
957
- print(f" PDF URL base: {pdf_url_base}", flush=True)
 
 
958
  else:
959
  print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
960
 
961
  # Create and launch the app
962
- app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
963
 
964
  # Ensure allowed paths are absolute for Gradio (only needed for local files)
965
  allowed = []
@@ -968,6 +998,15 @@ if __name__ == "__main__":
968
  allowed = [pdf_dir_parent]
969
  print(f"πŸš€ Launching with allowed_paths: {allowed}", flush=True)
970
  print(f"πŸ“‚ PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
 
 
 
 
 
 
 
 
 
971
  else:
972
  print("πŸš€ Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
973
 
 
42
  """
43
 
44
  def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
45
+ pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
46
  self.input_file = Path(input_file)
47
  self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
48
 
 
53
  # PDF configuration
54
  self.pdf_dir = Path(pdf_dir) if pdf_dir else None
55
  self.pdf_url_base = pdf_url_base
56
+ self.pdf_repo_id = pdf_repo_id
57
+
58
  if self.pdf_dir and not self.pdf_dir.exists():
59
  print(f"⚠️ PDF directory not found: {self.pdf_dir}")
60
  self.hf_enabled = False
 
473
  print(f"πŸ“„ Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
474
  else:
475
  print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
476
+ elif source_doc and self.pdf_repo_id:
477
+ # Server-side caching via HF Hub (avoids CORS/frontend download issues)
478
+ # Remove leading slash if present
479
+ source_doc_clean = source_doc.lstrip('/')
480
+ try:
481
+ from huggingface_hub import hf_hub_download
482
+ print(f"πŸ“₯ Downloading/Caching PDF from {self.pdf_repo_id}: {source_doc_clean}", flush=True)
483
+ pdf_path_cached = hf_hub_download(
484
+ repo_id=self.pdf_repo_id,
485
+ filename=source_doc_clean,
486
+ repo_type="dataset",
487
+ token=self.hf_token
488
+ )
489
+ pdf_value = str(pdf_path_cached)
490
+ print(f"πŸ“¦ Cached local path: {pdf_value}", flush=True)
491
+ except Exception as e:
492
+ print(f"❌ Failed to download PDF: {e}", flush=True)
493
+ # Fallback to URL if download fails and url base is available
494
+ if self.pdf_url_base:
495
+ pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
496
+ print(f"⚠️ Falling back to remote URL: {pdf_value}", flush=True)
497
+
498
  elif source_doc and self.pdf_url_base:
499
  # Remote PDF via URL (e.g., HF Datasets)
500
  # Remove any leading slashes from source_doc
 
573
 
574
 
575
  def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
576
+ pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
577
  """Create and configure Gradio app."""
578
+ annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)
579
 
580
  # Custom CSS for the green button and dark mode toggle
581
  css = """
 
967
  pdf_url_base = args.pdf_url_base
968
 
969
  # If no explicit PDF source, check for HF PDF repo environment variable
970
+ pdf_repo_id = None
971
  if not pdf_dir and not pdf_url_base:
972
  hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO") # e.g., "ai4data/reliefweb-pdfs"
973
  if hf_pdf_repo:
 
976
  # Already a full URL, use it directly (ensure it ends with /)
977
  pdf_url_base = hf_pdf_repo.rstrip('/') + '/'
978
  else:
979
+ # Repo ID format - enabling server-side caching!
980
+ pdf_repo_id = hf_pdf_repo
981
+ # Also set url base as fallback
982
  pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
983
+
984
  print(f"🌐 Using HF PDF repository: {hf_pdf_repo}", flush=True)
985
+ if pdf_repo_id:
986
+ print(f" πŸš€ Server-side caching ENABLED for repo: {pdf_repo_id}", flush=True)
987
+ print(f" PDF URL base (fallback): {pdf_url_base}", flush=True)
988
  else:
989
  print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
990
 
991
  # Create and launch the app
992
+ app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)
993
 
994
  # Ensure allowed paths are absolute for Gradio (only needed for local files)
995
  allowed = []
 
998
  allowed = [pdf_dir_parent]
999
  print(f"πŸš€ Launching with allowed_paths: {allowed}", flush=True)
1000
  print(f"πŸ“‚ PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
1001
+ elif pdf_repo_id:
1002
+ # If caching from HF, we need to allow access to the HF cache directory
1003
+ # Typical path: ~/.cache/huggingface/hub
1004
+ # We'll allow the user's home directory to be safe/simple for now,
1005
+ # or we could try to resolve the specific cache path.
1006
+ # Allowing hierarchy up to home is usually robust for local caches.
1007
+ home_dir = str(Path.home().resolve())
1008
+ allowed = [home_dir]
1009
+ print(f"πŸš€ Launching with cached HF PDFs - Allowing access to: {allowed}", flush=True)
1010
  else:
1011
  print("πŸš€ Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
1012