"""Disk-based metadata cache + thread-safe in-memory container for background generation."""
from __future__ import annotations
import hashlib
import json
import logging
import threading
from pathlib import Path
logger = logging.getLogger(__name__)
# Cache directory local to the project
CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata"
# Cache version — bump this when the metadata format changes to invalidate old caches.
# v2: switched page_num from 0-indexed to 1-indexed
# v3: removed related_legends, has_title_block, title_block_text; parallel batch generation
_CACHE_VERSION = "v3"
def _pdf_hash(pdf_bytes: bytes) -> str:
"""Compute a SHA-256 hash of the PDF bytes for cache keying."""
return hashlib.sha256(pdf_bytes).hexdigest()
def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Return the cached metadata list for the given PDF, or None on a miss.

    The cache key combines the PDF's SHA-256 hash with ``_CACHE_VERSION`` so
    that bumping the version automatically invalidates stale entries.

    Args:
        pdf_bytes: Raw bytes of the PDF, hashed to derive the cache key.

    Returns:
        The parsed metadata list, or None when no entry exists or the entry
        is unreadable/corrupt.
    """
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    # EAFP: read directly rather than exists()-then-read, which is racy if
    # the file disappears between the check and the read (and costs an
    # extra stat call).
    try:
        return json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable cache entry — treat it as a miss so the
        # caller regenerates the metadata.
        return None
def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Persist *metadata_list* to the disk cache, keyed by PDF hash.

    The write goes through a temporary sibling file followed by an atomic
    rename, so a crash mid-write (or a concurrent reader — the background
    generation thread writes here while the main thread may read) never
    observes a partially written cache entry.

    Args:
        pdf_bytes: Raw bytes of the PDF, hashed to derive the cache key.
        metadata_list: JSON-serializable per-page metadata to store.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(
        json.dumps(metadata_list, indent=2),
        encoding="utf-8",
    )
    # Path.replace is atomic on POSIX; on Windows it still overwrites an
    # existing target in one step.
    tmp_path.replace(cache_path)
class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. The background thread
    mutates fields on *this same object* (safe under CPython's GIL for simple
    attribute assignments). The main Streamlit thread reads from it on each
    rerun.
    """

    def __init__(self) -> None:
        # Lifecycle: not_started -> in_progress -> ready | failed
        self.status: str = "not_started"
        self.data_json: str = ""  # pre-serialized JSON for the planner
        self.error: str | None = None
        self._lock = threading.Lock()

    # -- convenience helpers --------------------------------------------------

    def set_ready(self, data_json: str) -> None:
        """Publish the serialized result and flip status to ``ready``."""
        with self._lock:
            # Assign the payload before the status so any reader that
            # observes "ready" also sees the data.
            self.data_json = data_json
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record the error message and flip status to ``failed``."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Flip status to ``in_progress``."""
        with self._lock:
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        """True once generation has completed successfully."""
        return self.status == "ready"

    def _generate(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback,
        what: str,
    ) -> None:
        """Shared generation pipeline for the sync and background entry points.

        Generates per-page metadata, writes it to the disk cache, and
        publishes the serialized result on this object. Any exception is
        captured into ``failed`` state rather than propagated.

        Args:
            pdf_path: Filesystem path of the PDF to process.
            num_pages: Number of pages to generate metadata for.
            pdf_bytes: Raw PDF bytes, used only as the disk-cache key.
            progress_callback: Optional ``(completed, total, label) -> None``
                forwarded to ``generate_page_metadata``.
            what: Log-message subject ("Metadata" or "Background metadata")
                so each caller keeps its original log output.
        """
        try:
            # Imported lazily to keep module import cheap and avoid cycles.
            from nodes.metadata_generator import generate_page_metadata

            metadata_list = generate_page_metadata(
                pdf_path, num_pages, progress_callback=progress_callback,
            )
            save_metadata(pdf_bytes, metadata_list)
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info("%s generation complete.", what)
        except Exception as e:
            self.set_failed(str(e))
            logger.exception("%s generation failed", what)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback=None,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions.

        Args:
            pdf_path: Filesystem path of the PDF to process.
            num_pages: Number of pages to generate metadata for.
            pdf_bytes: Raw PDF bytes, used only as the disk-cache key.
            progress_callback: Optional ``(completed, total, label) -> None``
                forwarded to ``generate_page_metadata`` for progress reporting.
        """
        self.set_in_progress()
        self._generate(pdf_path, num_pages, pdf_bytes, progress_callback, "Metadata")

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache.

        Status is set to ``in_progress`` *before* the thread starts so callers
        observe the transition immediately on return.
        """
        self.set_in_progress()

        def _run() -> None:
            self._generate(pdf_path, num_pages, pdf_bytes, None, "Background metadata")

        threading.Thread(target=_run, daemon=True).start()
|