|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentRegistry:
    """In-memory catalogue of processed documents, keyed by file base name.

    Each entry is a plain dict holding the document's name, its chunk count,
    the reported table-of-contents source, and references to the chunks,
    embeddings and search index that were passed in at registration time.
    """

    def __init__(self):
        """Start with an empty registry."""
        # Maps document base name -> entry dict built by ``register``.
        self._registry = dict()

    def register(self, file_path, chunks, embeddings, index, toc_source="unknown"):
        """Store a document's artefacts under the base name of ``file_path``.

        Registering a second path with the same base name replaces the
        earlier entry, because the base name is the only key.

        Args:
            file_path (str): Path to the document; only its final component
                is kept as the document name.
            chunks (list): Text chunks extracted from the document.
            embeddings (list): Vector embeddings supplied alongside the
                chunks (stored as-is, not validated against ``chunks``).
            index: Search index object for this document (stored as-is).
            toc_source (str): Label describing how the table of contents
                was detected. Defaults to ``"unknown"``.

        Returns:
            str: The name under which the document was registered.
        """
        import os

        doc_name = os.path.basename(file_path)
        chunk_count = len(chunks)

        # The inputs are stored by reference, not copied.
        record = {
            "name": doc_name,
            "num_chunks": chunk_count,
            "toc_source": toc_source,
            "chunks": chunks,
            "embeddings": embeddings,
            "index": index,
        }
        self._registry[doc_name] = record

        print(f"π Registered {doc_name} ({chunk_count} chunks)")
        return doc_name

    def list_docs(self):
        """Return a new list containing every registered entry dict.

        The entries are the full stored dicts (including chunks, embeddings
        and index), in registration order; they are not trimmed to a summary.
        """
        return [*self._registry.values()]

    def get_doc(self, name):
        """Return the entry registered under ``name``, or ``None`` if absent."""
        return self._registry.get(name, None)

    def clear(self):
        """Remove every entry from the registry in place."""
        self._registry.clear()
|
|
|