Shubham170793 committed on
Commit
d5790e2
·
verified ·
1 Parent(s): 7fa19c0

Update src/document_registry.py

Browse files
Files changed (1) hide show
  1. src/document_registry.py +34 -26
src/document_registry.py CHANGED
@@ -1,39 +1,47 @@
1
  # ==========================================================
2
- # document_registry.py — lightweight per-session library
3
  # ==========================================================
4
- import os, hashlib, time
5
 
6
  class DocumentRegistry:
7
  def __init__(self):
8
- self.docs = {}
 
9
 
10
- def _hash_file(self, file_path):
11
- """Compute short content hash to detect truly identical files"""
12
- hasher = hashlib.sha256()
13
- with open(file_path, "rb") as f:
14
- while chunk := f.read(8192):
15
- hasher.update(chunk)
16
- return hasher.hexdigest()[:16]
17
 
18
- def register(self, file_path, chunks, embeddings, index):
19
- """Store one processed document"""
20
- file_name = os.path.basename(file_path)
21
- content_hash = self._hash_file(file_path)
22
- doc_id = f"{file_name}_{content_hash}"
23
- self.docs[doc_id] = {
24
- "doc_name": file_name,
25
- "hash": content_hash,
26
- "path": file_path,
 
 
 
 
27
  "chunks": chunks,
28
  "embeddings": embeddings,
29
- "index": index,
30
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
31
  }
32
- print(f"📚 Registered {file_name} ({len(chunks)} chunks)")
33
- return doc_id
34
 
35
- def get(self, doc_id):
36
- return self.docs.get(doc_id)
 
 
37
 
38
  def list_docs(self):
39
- return list(self.docs.values())
 
 
 
 
 
 
 
 
 
 
1
  # ==========================================================
2
+ # document_registry.py — Lightweight Registry for Uploaded Docs
3
  # ==========================================================
 
4
 
5
  class DocumentRegistry:
6
  def __init__(self):
7
+ # Internal registry for storing uploaded documents and metadata
8
+ self._registry = {}
9
 
10
+ def register(self, file_path, chunks, embeddings, index, toc_source="unknown"):
11
+ """
12
+ Registers a new document in the in-memory registry.
 
 
 
 
13
 
14
+ Args:
15
+ file_path (str): Path to the uploaded or sample PDF.
16
+ chunks (list): List of text chunks extracted from the document.
17
+ embeddings (list): Corresponding vector embeddings.
18
+ index (FAISS Index): Search index for this document.
19
+ toc_source (str): How the Table of Contents was detected (heuristic/ai_inferred).
20
+ """
21
+ import os
22
+ name = os.path.basename(file_path)
23
+ entry = {
24
+ "name": name,
25
+ "num_chunks": len(chunks),
26
+ "toc_source": toc_source,
27
  "chunks": chunks,
28
  "embeddings": embeddings,
29
+ "index": index
 
30
  }
 
 
31
 
32
+ # Store or replace entry by filename
33
+ self._registry[name] = entry
34
+ print(f"📚 Registered {name} ({len(chunks)} chunks)")
35
+ return name # Return the doc ID (filename)
36
 
37
  def list_docs(self):
38
+ """Return a list of all registered documents with summary info."""
39
+ return list(self._registry.values())
40
+
41
+ def get_doc(self, name):
42
+ """Retrieve full document entry by name (for active context switching)."""
43
+ return self._registry.get(name)
44
+
45
+ def clear(self):
46
+ """Optional helper to clear all registry entries."""
47
+ self._registry.clear()