| | """Disk-based metadata cache + thread-safe in-memory container for background generation."""
|
| | from __future__ import annotations
|
| |
|
| | import hashlib
|
| | import json
|
| | import logging
|
| | import threading
|
| | from pathlib import Path
|
| |
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# On-disk cache location: a ".cache/metadata" folder placed in the parent of
# the directory that contains this file.
CACHE_DIR = Path(__file__).resolve().parent.parent.joinpath(".cache", "metadata")

# Version tag embedded in every cache file name. Changing it means files
# written under a previous tag no longer match lookups.
_CACHE_VERSION = "v3"
|
| |
|
| |
|
def _pdf_hash(pdf_bytes: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *pdf_bytes* (the cache key)."""
    hasher = hashlib.sha256()
    hasher.update(pdf_bytes)
    return hasher.hexdigest()
|
| |
|
| |
|
def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Load cached metadata for the given PDF from disk, if available.

    The cache file lives in ``CACHE_DIR`` and is named after the SHA-256 of
    ``pdf_bytes`` combined with ``_CACHE_VERSION``.

    Args:
        pdf_bytes: Raw bytes of the PDF whose metadata is requested.

    Returns:
        The decoded JSON content of the cache file, or ``None`` when the file
        is missing, cannot be read, or does not contain valid UTF-8 JSON.
    """
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    try:
        # Read directly instead of checking ``exists()`` first: a missing file
        # raises FileNotFoundError (an OSError), so there is no window between
        # the check and the read.
        raw_text = cache_path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for a file that is not valid UTF-8; any
        # unusable cache entry is treated as a cache miss.
        return None
|
| |
|
| |
|
def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Write *metadata_list* as indented JSON into the disk cache.

    The target file sits in ``CACHE_DIR`` (created on demand) and is named
    after the PDF's hash and the current cache version.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = CACHE_DIR.joinpath(f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json")
    # Serialize before opening the file so a serialization error cannot leave
    # an emptied file behind.
    payload = json.dumps(metadata_list, indent=2)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)
|
| |
|
| |
|
class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. A worker thread mutates
    fields on *this same object* through the ``set_*`` methods, which hold an
    internal lock. The main Streamlit thread reads the public attributes on
    each rerun.

    Attributes:
        status: One of ``"not_started"``, ``"in_progress"``, ``"ready"`` or
            ``"failed"``.
        data_json: JSON text of the generated metadata; empty until a run
            has succeeded.
        error: Message describing the most recent failure, or ``None`` when
            the current/last run has not failed.
    """

    def __init__(self) -> None:
        self.status: str = "not_started"
        self.data_json: str = ""
        self.error: str | None = None
        self._lock = threading.Lock()

    def set_ready(self, data_json: str) -> None:
        """Publish *data_json* and mark the state as ``"ready"``."""
        with self._lock:
            # Assign the payload before flipping the status so a reader that
            # observes "ready" without taking the lock also sees the data.
            self.data_json = data_json
            # A success supersedes any failure recorded by an earlier run.
            self.error = None
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record *error* and mark the state as ``"failed"``."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Mark the state as ``"in_progress"`` and drop any stale error."""
        with self._lock:
            # A new run starts clean: do not keep reporting the previous
            # run's failure while this one is still working.
            self.error = None
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        """Whether metadata has been generated and published."""
        return self.status == "ready"

    def _generate(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        *,
        label: str,
        generator_kwargs: dict,
    ) -> None:
        """Generate metadata, cache it on disk and record the outcome.

        Shared implementation behind ``generate_sync`` and
        ``start_background_generation``. Never raises for ordinary errors:
        any ``Exception`` is logged and stored via ``set_failed``.

        Args:
            pdf_path: Path passed through to ``generate_page_metadata``.
            num_pages: Page count passed through to ``generate_page_metadata``.
            pdf_bytes: Raw PDF bytes, used to key the disk cache.
            label: Prefix for the log messages of this run.
            generator_kwargs: Extra keyword arguments for
                ``generate_page_metadata``.
        """
        try:
            # Imported at call time rather than at module import time, as in
            # the original layout (presumably to avoid a heavy or circular
            # import - confirm before moving it).
            from nodes.metadata_generator import generate_page_metadata

            metadata_list = generate_page_metadata(
                pdf_path, num_pages, **generator_kwargs,
            )
            try:
                save_metadata(pdf_bytes, metadata_list)
            except OSError:
                # The disk cache is only an optimisation; failing to write it
                # must not throw away metadata that was generated fine.
                logger.warning(
                    "Could not write metadata cache; continuing without it.",
                    exc_info=True,
                )
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info("%s complete.", label)
        except Exception as exc:
            # Some exceptions stringify to "", which would leave ``error``
            # falsy; fall back to the exception class name in that case.
            self.set_failed(str(exc) or type(exc).__name__)
            logger.exception("%s failed", label)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback=None,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions.

        Args:
            pdf_path: Path of the PDF to analyse.
            num_pages: Number of pages to generate metadata for.
            pdf_bytes: Raw PDF bytes, used to key the disk cache.
            progress_callback: Optional ``(completed, total, label) -> None``
                forwarded to ``generate_page_metadata`` for progress reporting.
        """
        self.set_in_progress()
        self._generate(
            pdf_path,
            num_pages,
            pdf_bytes,
            label="Metadata generation",
            generator_kwargs={"progress_callback": progress_callback},
        )

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache.

        Returns immediately; the outcome is reported through ``status``,
        ``data_json`` and ``error``.

        Args:
            pdf_path: Path of the PDF to analyse.
            num_pages: Number of pages to generate metadata for.
            pdf_bytes: Raw PDF bytes, used to key the disk cache.
        """
        # Flip the status in the calling thread so the caller never observes
        # "not_started" after this method has returned.
        self.set_in_progress()
        worker = threading.Thread(
            target=self._generate,
            args=(pdf_path, num_pages, pdf_bytes),
            kwargs={
                "label": "Background metadata generation",
                "generator_kwargs": {},
            },
            daemon=True,
        )
        worker.start()
|
| |
|