File size: 4,940 Bytes
e1ced8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""Disk-based metadata cache + thread-safe in-memory container for background generation."""
from __future__ import annotations

import hashlib
import json
import logging
import threading
from pathlib import Path

logger = logging.getLogger(__name__)

# Cache directory local to the project
CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata"

# Cache version — bump this when the metadata format changes to invalidate old caches.
# v2: switched page_num from 0-indexed to 1-indexed
# v3: removed related_legends, has_title_block, title_block_text; parallel batch generation
_CACHE_VERSION = "v3"


def _pdf_hash(pdf_bytes: bytes) -> str:
    """Compute a SHA-256 hash of the PDF bytes for cache keying."""
    return hashlib.sha256(pdf_bytes).hexdigest()


def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Look up cached metadata on disk for the given PDF.

    The cache key combines the PDF's SHA-256 hash with ``_CACHE_VERSION``,
    so bumping the version constant invalidates all older entries.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The cached metadata list, or ``None`` on a miss or an unreadable /
        corrupt entry (corruption is treated as a miss, never an error).
    """
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    try:
        # EAFP: read directly rather than exists()-then-read, which is
        # racy if another process deletes the entry between the two calls.
        return json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None  # plain cache miss
    except (json.JSONDecodeError, OSError):
        return None  # corrupt or unreadable entry — treat as a miss


def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Persist metadata to disk, keyed by PDF hash and cache version.

    Writes to a temporary sibling file first and atomically renames it into
    place, so a crash mid-write can never leave a truncated cache entry that
    ``get_cached_metadata`` would later reject as corrupt.

    Args:
        pdf_bytes: Raw bytes of the PDF document (hashed for the cache key).
        metadata_list: Per-page metadata dicts to serialize as JSON.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_path = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    tmp_path = cache_path.with_suffix(".json.tmp")
    tmp_path.write_text(
        json.dumps(metadata_list, indent=2),
        encoding="utf-8",
    )
    # Atomic on POSIX; on Windows, replace() still overwrites in one step.
    tmp_path.replace(cache_path)


class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. The background thread
    mutates fields on *this same object* (safe under CPython's GIL for simple
    attribute assignments). The main Streamlit thread reads from it on each
    rerun.
    """

    def __init__(self) -> None:
        # Lifecycle: not_started | in_progress | ready | failed
        self.status: str = "not_started"
        # Pre-serialized JSON of the metadata list, handed to the planner.
        self.data_json: str = ""
        # Human-readable failure reason (set only when status == "failed").
        self.error: str | None = None
        self._lock = threading.Lock()

    # -- convenience helpers --------------------------------------------------

    def set_ready(self, data_json: str) -> None:
        """Store the serialized metadata and mark generation as complete."""
        with self._lock:
            self.data_json = data_json
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record the failure reason and mark generation as failed."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Mark generation as running."""
        with self._lock:
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        """True once metadata has been generated and serialized."""
        return self.status == "ready"

    # -- generation -----------------------------------------------------------

    def _generate(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback=None,
        *,
        log_label: str = "Metadata",
    ) -> None:
        """Shared generate -> cache -> publish pipeline for both entry points.

        Runs ``generate_page_metadata``, persists the result to the disk
        cache, and flips this object's status to ready/failed accordingly.
        ``log_label`` only customizes the log messages so sync and background
        runs remain distinguishable in the logs.
        """
        try:
            # Imported lazily to keep module import cheap and avoid cycles.
            from nodes.metadata_generator import generate_page_metadata

            # NOTE(review): passing progress_callback=None assumes the
            # callee's default is None — matches the documented contract.
            metadata_list = generate_page_metadata(
                pdf_path, num_pages, progress_callback=progress_callback,
            )
            save_metadata(pdf_bytes, metadata_list)
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info("%s generation complete.", log_label)
        except Exception as e:
            self.set_failed(str(e))
            logger.exception("%s generation failed", log_label)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        progress_callback=None,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions.

        Args:
            pdf_path: Path to the PDF on disk.
            num_pages: Number of pages to process.
            pdf_bytes: Raw PDF bytes, used as the disk-cache key.
            progress_callback: Optional ``(completed, total, label) -> None``
                forwarded to ``generate_page_metadata`` for progress reporting.
        """
        self.set_in_progress()
        self._generate(pdf_path, num_pages, pdf_bytes, progress_callback)

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache.

        Status is set to in_progress *before* the thread starts so the very
        next Streamlit rerun already observes the correct state.
        """
        self.set_in_progress()

        def _run():
            self._generate(
                pdf_path, num_pages, pdf_bytes, log_label="Background metadata",
            )

        thread = threading.Thread(target=_run, daemon=True)
        thread.start()