"""Disk-based metadata cache + thread-safe in-memory container for background generation.""" from __future__ import annotations import hashlib import json import logging import threading from pathlib import Path logger = logging.getLogger(__name__) # Cache directory local to the project CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata" # Cache version — bump this when the metadata format changes to invalidate old caches. # v2: switched page_num from 0-indexed to 1-indexed # v3: removed related_legends, has_title_block, title_block_text; parallel batch generation _CACHE_VERSION = "v3" def _pdf_hash(pdf_bytes: bytes) -> str: """Compute a SHA-256 hash of the PDF bytes for cache keying.""" return hashlib.sha256(pdf_bytes).hexdigest() def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None: """Check if metadata exists on disk for the given PDF. Returns the metadata list if found, None otherwise. """ cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json" if cache_path.exists(): try: return json.loads(cache_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): return None return None def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None: """Save metadata to disk, keyed by PDF hash.""" CACHE_DIR.mkdir(parents=True, exist_ok=True) cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json" cache_path.write_text( json.dumps(metadata_list, indent=2), encoding="utf-8", ) class MetadataState: """Thread-safe container for background metadata generation state. Stored as a single object in ``st.session_state``. The background thread mutates fields on *this same object* (safe under CPython's GIL for simple attribute assignments). The main Streamlit thread reads from it on each rerun. """ def __init__(self) -> None: self.status: str = "not_started" # not_started | in_progress | ready | failed self.data_json: str = "" # pre-serialized JSON for the planner self.error: str | None = None self._lock = threading.Lock() # -- convenience helpers -------------------------------------------------- def set_ready(self, data_json: str) -> None: with self._lock: self.data_json = data_json self.status = "ready" def set_failed(self, error: str) -> None: with self._lock: self.error = error self.status = "failed" def set_in_progress(self) -> None: with self._lock: self.status = "in_progress" @property def is_ready(self) -> bool: return self.status == "ready" def generate_sync( self, pdf_path: str, num_pages: int, pdf_bytes: bytes, progress_callback=None, ) -> None: """Generate metadata synchronously (blocking). Same logic as ``start_background_generation`` but runs in the calling thread. Used during initialization so metadata is ready before the user can ask questions. Args: progress_callback: Optional ``(completed, total, label) -> None`` forwarded to ``generate_page_metadata`` for progress reporting. """ self.set_in_progress() try: from nodes.metadata_generator import generate_page_metadata metadata_list = generate_page_metadata( pdf_path, num_pages, progress_callback=progress_callback, ) save_metadata(pdf_bytes, metadata_list) self.set_ready(json.dumps(metadata_list, indent=2)) logger.info("Metadata generation complete.") except Exception as e: self.set_failed(str(e)) logger.exception("Metadata generation failed") def start_background_generation( self, pdf_path: str, num_pages: int, pdf_bytes: bytes, ) -> None: """Launch a daemon thread that generates metadata and writes to disk cache.""" self.set_in_progress() def _run(): try: from nodes.metadata_generator import generate_page_metadata metadata_list = generate_page_metadata(pdf_path, num_pages) save_metadata(pdf_bytes, metadata_list) self.set_ready(json.dumps(metadata_list, indent=2)) logger.info("Background metadata generation complete.") except Exception as e: self.set_failed(str(e)) logger.exception("Background metadata generation failed") thread = threading.Thread(target=_run, daemon=True) thread.start()