Updated_code_complaince / tools /metadata_cache.py
Ryan2219's picture
Upload 70 files
e1ced8e verified
"""Disk-based metadata cache + thread-safe in-memory container for background generation."""
from __future__ import annotations
import hashlib
import json
import logging
import threading
from pathlib import Path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cache directory local to the project: two levels up from this file
# (the parent of the directory containing it), under ".cache/metadata".
CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata"
# Cache version — bump this when the metadata format changes to invalidate old caches.
# The version is part of every cache filename, so files written under an older
# version are simply never looked up again.
# v2: switched page_num from 0-indexed to 1-indexed
# v3: removed related_legends, has_title_block, title_block_text; parallel batch generation
_CACHE_VERSION = "v3"
def _pdf_hash(pdf_bytes: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *pdf_bytes*.

    The digest identifies a PDF by content, so it serves as the cache key.
    """
    hasher = hashlib.sha256()
    hasher.update(pdf_bytes)
    return hasher.hexdigest()
def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Return the cached metadata for the given PDF, or ``None`` on a cache miss.

    The cache file is looked up in ``CACHE_DIR`` under a name built from the
    SHA-256 of *pdf_bytes* and the current cache version.

    Args:
        pdf_bytes: Raw bytes of the PDF whose metadata is wanted.

    Returns:
        The decoded metadata list if a readable, well-formed cache file
        exists; ``None`` if the file is missing, unreadable, not valid
        UTF-8/JSON, or does not contain a JSON list.
    """
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    # Read directly instead of checking exists() first: the file may vanish
    # (or appear) between the check and the read.
    try:
        cached = json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # Ordinary cache miss — nothing worth logging.
        return None
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a corrupted cache file degrades to a miss instead of crashing.
        logger.warning(
            "Ignoring unreadable metadata cache file %s", cache_path, exc_info=True
        )
        return None
    if not isinstance(cached, list):
        # Valid JSON but not the expected shape; treat as a miss so callers
        # never receive something that violates the return annotation.
        logger.warning("Ignoring malformed metadata cache file %s", cache_path)
        return None
    return cached
def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Save metadata to disk, keyed by PDF hash.

    The JSON is written to a temporary sibling file and then moved over the
    final path, so a reader never observes a half-written cache entry and an
    interrupted write never leaves a truncated file under the real name.

    Args:
        pdf_bytes: Raw bytes of the PDF; their SHA-256 forms the cache key.
        metadata_list: JSON-serializable metadata to store.

    Raises:
        OSError: If the cache directory or file cannot be written.
        TypeError: If *metadata_list* is not JSON-serializable.
    """
    import os  # local import: only needed to make the temp name process-unique

    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    # Serialize before touching the filesystem so a serialization error
    # leaves no stray files behind.
    payload = json.dumps(metadata_list, indent=2)
    # Per-process, per-thread temp name so concurrent writers for the same
    # PDF cannot clobber each other's partial output.
    tmp_path = cache_path.with_name(
        f"{cache_path.name}.{os.getpid()}.{threading.get_ident()}.tmp"
    )
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(cache_path)
    finally:
        # After a successful replace the temp file is gone and this is a
        # no-op; after a failure it removes the leftover partial file.
        tmp_path.unlink(missing_ok=True)
class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. The background thread
    mutates fields on *this same object* (safe under CPython's GIL for simple
    attribute assignments). The main Streamlit thread reads from it on each
    rerun.

    Attributes are updated through the ``set_*`` helpers, which take an
    internal lock so that related fields (for example ``data_json`` and
    ``status``) change together.
    """

    def __init__(self) -> None:
        self.status: str = "not_started"  # not_started | in_progress | ready | failed
        self.data_json: str = ""  # pre-serialized JSON for the planner
        self.error: str | None = None  # message of the last failure, if any
        self._lock = threading.Lock()

    # -- convenience helpers --------------------------------------------------

    def set_ready(self, data_json: str) -> None:
        """Store the serialized metadata and mark the state as ``ready``."""
        with self._lock:
            self.data_json = data_json
            # A successful run supersedes any error left by an earlier attempt.
            self.error = None
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record *error* and mark the state as ``failed``."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Mark the state as ``in_progress`` and drop any stale error."""
        with self._lock:
            # Clear the previous failure so a retry does not report it again.
            self.error = None
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        """``True`` once metadata has been generated and stored."""
        return self.status == "ready"

    def _run_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        generator_kwargs: dict,
        success_message: str,
        failure_message: str,
    ) -> None:
        """Generate metadata, cache it on disk, and update this state object.

        Shared by the synchronous and background entry points. Any exception
        is caught, logged with *failure_message*, and recorded via
        ``set_failed``; nothing propagates to the caller.

        Args:
            pdf_path: Path passed through to ``generate_page_metadata``.
            num_pages: Page count passed through to ``generate_page_metadata``.
            pdf_bytes: Raw PDF bytes, used only to key the disk cache.
            generator_kwargs: Extra keyword arguments for
                ``generate_page_metadata``.
            success_message: Logged at INFO level on success.
            failure_message: Logged with traceback on failure.
        """
        try:
            # Imported lazily, as in the original code, so importing this
            # module does not pull in the generator.
            from nodes.metadata_generator import generate_page_metadata

            metadata_list = generate_page_metadata(
                pdf_path, num_pages, **generator_kwargs
            )
            try:
                save_metadata(pdf_bytes, metadata_list)
            except OSError:
                # The disk cache is only an optimisation: failing to write it
                # must not throw away metadata that was generated successfully.
                logger.warning(
                    "Could not write metadata cache; continuing without it",
                    exc_info=True,
                )
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info(success_message)
        except Exception as exc:
            # Some exceptions stringify to "", which would make the failure
            # look like "no error"; fall back to the repr in that case.
            self.set_failed(str(exc) or repr(exc))
            logger.exception(failure_message)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback=None,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions. Failures are recorded on this object
        (``status == "failed"``, ``error`` set) rather than raised.

        Args:
            pdf_path: Path of the PDF to process.
            num_pages: Number of pages in the PDF.
            pdf_bytes: Raw PDF bytes, used to key the disk cache.
            progress_callback: Optional ``(completed, total, label) -> None``
                forwarded to ``generate_page_metadata`` for progress reporting.
        """
        self.set_in_progress()
        self._run_generation(
            pdf_path,
            num_pages,
            pdf_bytes,
            generator_kwargs={"progress_callback": progress_callback},
            success_message="Metadata generation complete.",
            failure_message="Metadata generation failed",
        )

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache.

        Returns immediately; poll ``status`` / ``is_ready`` for the outcome.

        Args:
            pdf_path: Path of the PDF to process.
            num_pages: Number of pages in the PDF.
            pdf_bytes: Raw PDF bytes, used to key the disk cache.
        """
        self.set_in_progress()

        def _run() -> None:
            # No progress callback is forwarded on the background path.
            self._run_generation(
                pdf_path,
                num_pages,
                pdf_bytes,
                generator_kwargs={},
                success_message="Background metadata generation complete.",
                failure_message="Background metadata generation failed",
            )

        worker = threading.Thread(target=_run, daemon=True)
        worker.start()