Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Community

Shubham170793 committed on Oct 25

Commit

d5790e2

verified ·

1 Parent(s): 7fa19c0

Update src/document_registry.py

Browse files

Files changed (1) hide show

src/document_registry.py +34 -26

src/document_registry.py CHANGED Viewed

@@ -1,39 +1,47 @@
 # ==========================================================
-# document_registry.py — lightweight per-session library
 # ==========================================================
-import os, hashlib, time
 class DocumentRegistry:
     def __init__(self):
-        self.docs = {}
-    def _hash_file(self, file_path):
-        """Compute short content hash to detect truly identical files"""
-        hasher = hashlib.sha256()
-        with open(file_path, "rb") as f:
-            while chunk := f.read(8192):
-                hasher.update(chunk)
-        return hasher.hexdigest()[:16]
-    def register(self, file_path, chunks, embeddings, index):
-        """Store one processed document"""
-        file_name = os.path.basename(file_path)
-        content_hash = self._hash_file(file_path)
-        doc_id = f"{file_name}_{content_hash}"
-        self.docs[doc_id] = {
-            "doc_name": file_name,
-            "hash": content_hash,
-            "path": file_path,
             "chunks": chunks,
             "embeddings": embeddings,
-            "index": index,
-            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
         }
-        print(f"📚 Registered {file_name} ({len(chunks)} chunks)")
-        return doc_id
-    def get(self, doc_id):
-        return self.docs.get(doc_id)
     def list_docs(self):
-        return list(self.docs.values())

 # ==========================================================
+# document_registry.py — Lightweight Registry for Uploaded Docs
 # ==========================================================
 class DocumentRegistry:
     def __init__(self):
+        # Internal registry for storing uploaded documents and metadata
+        self._registry = {}
+    def register(self, file_path, chunks, embeddings, index, toc_source="unknown"):
+        """
+        Registers a new document in the in-memory registry.
+        Args:
+            file_path (str): Path to the uploaded or sample PDF.
+            chunks (list): List of text chunks extracted from the document.
+            embeddings (list): Corresponding vector embeddings.
+            index (FAISS Index): Search index for this document.
+            toc_source (str): How the Table of Contents was detected (heuristic/ai_inferred).
+        """
+        import os
+        name = os.path.basename(file_path)
+        entry = {
+            "name": name,
+            "num_chunks": len(chunks),
+            "toc_source": toc_source,
             "chunks": chunks,
             "embeddings": embeddings,
+            "index": index
         }
+        # Store or replace entry by filename
+        self._registry[name] = entry
+        print(f"📚 Registered {name} ({len(chunks)} chunks)")
+        return name  # Return the doc ID (filename)
     def list_docs(self):
+        """Return a list of all registered documents with summary info."""
+        return list(self._registry.values())
+    def get_doc(self, name):
+        """Retrieve full document entry by name (for active context switching)."""
+        return self._registry.get(name)
+    def clear(self):
+        """Optional helper to clear all registry entries."""
+        self._registry.clear()