Spaces:
Sleeping
Sleeping
Commit
·
1092ade
1
Parent(s):
7dd10e6
ad caching
Browse files
app.py
CHANGED
|
@@ -42,7 +42,7 @@ class ValidationAnnotator:
|
|
| 42 |
"""
|
| 43 |
|
| 44 |
def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
|
| 45 |
-
pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
|
| 46 |
self.input_file = Path(input_file)
|
| 47 |
self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
|
| 48 |
|
|
@@ -53,6 +53,8 @@ class ValidationAnnotator:
|
|
| 53 |
# PDF configuration
|
| 54 |
self.pdf_dir = Path(pdf_dir) if pdf_dir else None
|
| 55 |
self.pdf_url_base = pdf_url_base
|
|
|
|
|
|
|
| 56 |
if self.pdf_dir and not self.pdf_dir.exists():
|
| 57 |
print(f"⚠️ PDF directory not found: {self.pdf_dir}")
|
| 58 |
self.hf_enabled = False
|
|
@@ -471,6 +473,28 @@ class ValidationAnnotator:
|
|
| 471 |
print(f"π Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
|
| 472 |
else:
|
| 473 |
print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
elif source_doc and self.pdf_url_base:
|
| 475 |
# Remote PDF via URL (e.g., HF Datasets)
|
| 476 |
# Remove any leading slashes from source_doc
|
|
@@ -549,9 +573,9 @@ class ValidationAnnotator:
|
|
| 549 |
|
| 550 |
|
| 551 |
def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
|
| 552 |
-
pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None):
|
| 553 |
"""Create and configure Gradio app."""
|
| 554 |
-
annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
|
| 555 |
|
| 556 |
# Custom CSS for the green button and dark mode toggle
|
| 557 |
css = """
|
|
@@ -943,6 +967,7 @@ if __name__ == "__main__":
|
|
| 943 |
pdf_url_base = args.pdf_url_base
|
| 944 |
|
| 945 |
# If no explicit PDF source, check for HF PDF repo environment variable
|
|
|
|
| 946 |
if not pdf_dir and not pdf_url_base:
|
| 947 |
hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO") # e.g., "ai4data/reliefweb-pdfs"
|
| 948 |
if hf_pdf_repo:
|
|
@@ -951,15 +976,20 @@ if __name__ == "__main__":
|
|
| 951 |
# Already a full URL, use it directly (ensure it ends with /)
|
| 952 |
pdf_url_base = hf_pdf_repo.rstrip('/') + '/'
|
| 953 |
else:
|
| 954 |
-
# Repo ID format
|
|
|
|
|
|
|
| 955 |
pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
|
|
|
|
| 956 |
print(f"π Using HF PDF repository: {hf_pdf_repo}", flush=True)
|
| 957 |
-
|
|
|
|
|
|
|
| 958 |
else:
|
| 959 |
print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
|
| 960 |
|
| 961 |
# Create and launch the app
|
| 962 |
-
app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base)
|
| 963 |
|
| 964 |
# Ensure allowed paths are absolute for Gradio (only needed for local files)
|
| 965 |
allowed = []
|
|
@@ -968,6 +998,15 @@ if __name__ == "__main__":
|
|
| 968 |
allowed = [pdf_dir_parent]
|
| 969 |
print(f"π Launching with allowed_paths: {allowed}", flush=True)
|
| 970 |
print(f"π PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
else:
|
| 972 |
print("π Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
|
| 973 |
|
|
|
|
| 42 |
"""
|
| 43 |
|
| 44 |
def __init__(self, input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
|
| 45 |
+
pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
|
| 46 |
self.input_file = Path(input_file)
|
| 47 |
self.output_file = self.input_file.parent / f"{self.input_file.stem}_human_validated.jsonl"
|
| 48 |
|
|
|
|
| 53 |
# PDF configuration
|
| 54 |
self.pdf_dir = Path(pdf_dir) if pdf_dir else None
|
| 55 |
self.pdf_url_base = pdf_url_base
|
| 56 |
+
self.pdf_repo_id = pdf_repo_id
|
| 57 |
+
|
| 58 |
if self.pdf_dir and not self.pdf_dir.exists():
|
| 59 |
print(f"⚠️ PDF directory not found: {self.pdf_dir}")
|
| 60 |
self.hf_enabled = False
|
|
|
|
| 473 |
print(f"π Found PDF for sample {self.current_idx}: {pdf_value} (Page {page_num})", flush=True)
|
| 474 |
else:
|
| 475 |
print(f"⚠️ PDF file not found: {pdf_path}", flush=True)
|
| 476 |
+
elif source_doc and self.pdf_repo_id:
|
| 477 |
+
# Server-side caching via HF Hub (avoids CORS/frontend download issues)
|
| 478 |
+
# Remove leading slash if present
|
| 479 |
+
source_doc_clean = source_doc.lstrip('/')
|
| 480 |
+
try:
|
| 481 |
+
from huggingface_hub import hf_hub_download
|
| 482 |
+
print(f"📥 Downloading/Caching PDF from {self.pdf_repo_id}: {source_doc_clean}", flush=True)
|
| 483 |
+
pdf_path_cached = hf_hub_download(
|
| 484 |
+
repo_id=self.pdf_repo_id,
|
| 485 |
+
filename=source_doc_clean,
|
| 486 |
+
repo_type="dataset",
|
| 487 |
+
token=self.hf_token
|
| 488 |
+
)
|
| 489 |
+
pdf_value = str(pdf_path_cached)
|
| 490 |
+
print(f"📦 Cached local path: {pdf_value}", flush=True)
|
| 491 |
+
except Exception as e:
|
| 492 |
+
print(f"❌ Failed to download PDF: {e}", flush=True)
|
| 493 |
+
# Fallback to URL if download fails and url base is available
|
| 494 |
+
if self.pdf_url_base:
|
| 495 |
+
pdf_value = f"{self.pdf_url_base.rstrip('/')}/{source_doc_clean}"
|
| 496 |
+
print(f"⚠️ Falling back to remote URL: {pdf_value}", flush=True)
|
| 497 |
+
|
| 498 |
elif source_doc and self.pdf_url_base:
|
| 499 |
# Remote PDF via URL (e.g., HF Datasets)
|
| 500 |
# Remove any leading slashes from source_doc
|
|
|
|
| 573 |
|
| 574 |
|
| 575 |
def create_app(input_file: str, hf_dataset_repo: Optional[str] = None, hf_token: Optional[str] = None,
|
| 576 |
+
pdf_dir: Optional[str] = None, pdf_url_base: Optional[str] = None, pdf_repo_id: Optional[str] = None):
|
| 577 |
"""Create and configure Gradio app."""
|
| 578 |
+
annotator = ValidationAnnotator(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)
|
| 579 |
|
| 580 |
# Custom CSS for the green button and dark mode toggle
|
| 581 |
css = """
|
|
|
|
| 967 |
pdf_url_base = args.pdf_url_base
|
| 968 |
|
| 969 |
# If no explicit PDF source, check for HF PDF repo environment variable
|
| 970 |
+
pdf_repo_id = None
|
| 971 |
if not pdf_dir and not pdf_url_base:
|
| 972 |
hf_pdf_repo = os.getenv("HF_RELIEFWEB_PDFS_REPO") # e.g., "ai4data/reliefweb-pdfs"
|
| 973 |
if hf_pdf_repo:
|
|
|
|
| 976 |
# Already a full URL, use it directly (ensure it ends with /)
|
| 977 |
pdf_url_base = hf_pdf_repo.rstrip('/') + '/'
|
| 978 |
else:
|
| 979 |
+
# Repo ID format - enabling server-side caching!
|
| 980 |
+
pdf_repo_id = hf_pdf_repo
|
| 981 |
+
# Also set url base as fallback
|
| 982 |
pdf_url_base = f"https://huggingface.co/datasets/{hf_pdf_repo}/resolve/main/"
|
| 983 |
+
|
| 984 |
print(f"π Using HF PDF repository: {hf_pdf_repo}", flush=True)
|
| 985 |
+
if pdf_repo_id:
|
| 986 |
+
print(f" π Server-side caching ENABLED for repo: {pdf_repo_id}", flush=True)
|
| 987 |
+
print(f" PDF URL base (fallback): {pdf_url_base}", flush=True)
|
| 988 |
else:
|
| 989 |
print("⚠️ No PDF source configured. Set --pdf-dir, --pdf-url-base, or HF_RELIEFWEB_PDFS_REPO.", flush=True)
|
| 990 |
|
| 991 |
# Create and launch the app
|
| 992 |
+
app = create_app(input_file, hf_dataset_repo, hf_token, pdf_dir, pdf_url_base, pdf_repo_id)
|
| 993 |
|
| 994 |
# Ensure allowed paths are absolute for Gradio (only needed for local files)
|
| 995 |
allowed = []
|
|
|
|
| 998 |
allowed = [pdf_dir_parent]
|
| 999 |
print(f"π Launching with allowed_paths: {allowed}", flush=True)
|
| 1000 |
print(f"π PDF Directory Check: {Path(pdf_dir).exists()}", flush=True)
|
| 1001 |
+
elif pdf_repo_id:
|
| 1002 |
+
# If caching from HF, we need to allow access to the HF cache directory
|
| 1003 |
+
# Typical path: ~/.cache/huggingface/hub
|
| 1004 |
+
# We'll allow the user's home directory to be safe/simple for now,
|
| 1005 |
+
# or we could try to resolve the specific cache path.
|
| 1006 |
+
# Allowing hierarchy up to home is usually robust for local caches.
|
| 1007 |
+
home_dir = str(Path.home().resolve())
|
| 1008 |
+
allowed = [home_dir]
|
| 1009 |
+
print(f"π Launching with cached HF PDFs - Allowing access to: {allowed}", flush=True)
|
| 1010 |
else:
|
| 1011 |
print("π Launching with remote PDF URLs (no local allowed_paths needed)", flush=True)
|
| 1012 |
|