Shubham170793 committed on
Commit
0907913
·
verified ·
1 Parent(s): dc571c1

Create document_registry.py

Browse files
Files changed (1) hide show
  1. src/document_registry.py +39 -0
src/document_registry.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================================
2
+ # document_registry.py — lightweight per-session library
3
+ # ==========================================================
4
+ import os, hashlib, time
5
+
6
class DocumentRegistry:
    """In-memory catalogue of processed documents, keyed by name and content hash."""

    def __init__(self):
        # Maps doc_id -> record dict built by register().
        self.docs = {}

    def _hash_file(self, file_path):
        """Return the first 16 hex characters of the file's SHA-256 digest.

        The file is read in 8192-byte blocks so it is never loaded into
        memory in one piece.
        """
        digest = hashlib.sha256()
        with open(file_path, "rb") as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for block in iter(lambda: stream.read(8192), b""):
                digest.update(block)
        return digest.hexdigest()[:16]

    def register(self, file_path, chunks, embeddings, index):
        """Record a processed document and return its identifier.

        The identifier is ``"<basename>_<content hash>"``; registering under
        an identifier that already exists replaces the stored record.

        Args:
            file_path: Path of the source file; it is opened to compute the hash.
            chunks: Chunks of the document; must support ``len()``.
            embeddings: Embeddings to keep with the document (stored as given).
            index: Index object to keep with the document (stored as given).

        Returns:
            The identifier under which the record was stored.
        """
        base_name = os.path.basename(file_path)
        digest = self._hash_file(file_path)
        key = f"{base_name}_{digest}"

        record = {
            "doc_name": base_name,
            "hash": digest,
            "path": file_path,
            "chunks": chunks,
            "embeddings": embeddings,
            "index": index,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        # The record is stored before the progress message is printed.
        self.docs[key] = record

        chunk_count = len(chunks)
        print(f"📚 Registered {base_name} ({chunk_count} chunks)")
        return key

    def get(self, doc_id):
        """Return the record stored under ``doc_id``, or ``None`` if absent."""
        try:
            return self.docs[doc_id]
        except KeyError:
            return None

    def list_docs(self):
        """Return a new list holding every stored record, in insertion order."""
        return [*self.docs.values()]