cryogenic22 committed on
Commit
9b68bd7
·
verified ·
1 Parent(s): ebe022a

Create storage.py

Browse files
Files changed (1) hide show
  1. utils/storage.py +189 -0
utils/storage.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# utils/storage.py

import json
import os
import pickle
import shutil
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Optional

import faiss
import numpy as np
import streamlit as st

14
class PersistentStorage:
    """Filesystem-backed persistent storage for the application.

    Manages four sibling directories under a single base path:

    - ``database/``     the SQLite database file
    - ``files/``        uploaded documents, optionally grouped per collection
    - ``vectorstore/``  serialized FAISS indexes
    - ``metadata/``     one JSON sidecar per stored file
    """

    def __init__(self):
        # NOTE(review): assumes /data is a writable persistent volume
        # (typical for hosted Spaces) -- confirm for other deployments.
        self.base_path = Path("/data")

        # Sub-directories for each kind of artifact.
        self.db_path = self.base_path / "database"
        self.files_path = self.base_path / "files"
        self.vectorstore_path = self.base_path / "vectorstore"
        self.metadata_path = self.base_path / "metadata"

        # Ensure the directory tree exists before first use.
        self._create_directories()

    def _create_directories(self):
        """Create the directory tree, tolerating pre-existing directories."""
        for path in (self.db_path, self.files_path,
                     self.vectorstore_path, self.metadata_path):
            path.mkdir(parents=True, exist_ok=True)

    def get_db_path(self) -> str:
        """Return the path of the SQLite database file as a string."""
        return str(self.db_path / "rfp_analysis.db")

    def save_uploaded_file(self, uploaded_file, collection_id: Optional[int] = None) -> Path:
        """Persist an uploaded file and write a JSON metadata sidecar.

        Args:
            uploaded_file: Streamlit ``UploadedFile``-like object; must expose
                ``.name``, ``.size``, ``.type`` and ``.getbuffer()``.
            collection_id: optional collection id; when given, the file is
                stored in a per-collection subdirectory.

        Returns:
            Path of the saved file (timestamp-prefixed to avoid name clashes).
        """
        # Fix: compare against None so a collection id of 0 is not silently
        # treated as "no collection".
        if collection_id is not None:
            save_dir = self.files_path / str(collection_id)
            save_dir.mkdir(parents=True, exist_ok=True)
        else:
            save_dir = self.files_path

        # Timestamp prefix keeps repeated uploads of the same filename
        # distinct (to one-second resolution).
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{timestamp}_{uploaded_file.name}"
        file_path = save_dir / filename

        with file_path.open("wb") as f:
            f.write(uploaded_file.getbuffer())

        metadata = {
            "original_name": uploaded_file.name,
            "upload_time": timestamp,
            "collection_id": collection_id,
            "size": uploaded_file.size,
            "type": uploaded_file.type,
        }
        self._save_metadata(file_path.stem, metadata)

        return file_path

    def _save_metadata(self, file_id: str, metadata: dict):
        """Write *metadata* to ``metadata/<file_id>.json`` (overwrites)."""
        metadata_file = self.metadata_path / f"{file_id}.json"
        with metadata_file.open("w") as f:
            json.dump(metadata, f)

    def save_vectorstore(self, vectorstore, collection_id: Optional[int] = None):
        """Serialize a (LangChain-style) FAISS vector store to disk.

        Writes the raw FAISS index plus a pickle of the docstore contents so
        :meth:`load_vectorstore` can reconstruct it later.
        """
        if collection_id is not None:
            save_path = self.vectorstore_path / f"collection_{collection_id}"
        else:
            save_path = self.vectorstore_path / "main"
        save_path.mkdir(parents=True, exist_ok=True)

        # Native FAISS serialization for the index itself.
        faiss.write_index(vectorstore.index, str(save_path / "index.faiss"))

        # Persist documents + id mapping.  NOTE(review): relies on the
        # private ``docstore._dict`` attribute of LangChain's
        # InMemoryDocstore -- fragile across LangChain versions.
        with (save_path / "store.pkl").open("wb") as f:
            pickle.dump(
                {
                    "documents": vectorstore.docstore._dict,
                    "index_to_docstore_id": vectorstore.index_to_docstore_id,
                },
                f,
            )

    def load_vectorstore(self, collection_id: Optional[int] = None):
        """Load a vector store saved by :meth:`save_vectorstore`.

        Returns the reconstructed store, or ``None`` if nothing was saved
        for this collection or loading fails (error shown in the UI).
        """
        if collection_id is not None:
            load_path = self.vectorstore_path / f"collection_{collection_id}"
        else:
            load_path = self.vectorstore_path / "main"

        if not load_path.exists():
            return None

        try:
            index = faiss.read_index(str(load_path / "index.faiss"))

            # SECURITY NOTE: pickle.load must only ever see files this app
            # wrote itself; never point it at untrusted data.
            with (load_path / "store.pkl").open("rb") as f:
                store_data = pickle.load(f)

            # NOTE(review): ``FAISS`` and ``get_embeddings_model`` are not
            # defined or imported in this module, so this call raises
            # NameError and always lands in the except branch -- they must
            # be imported here for loading to work.  Also, LangChain's FAISS
            # expects a Docstore instance, not the raw dict saved above
            # (likely needs InMemoryDocstore(store_data["documents"])).
            vectorstore = FAISS(
                embedding_function=get_embeddings_model(),
                index=index,
                docstore=store_data["documents"],
                index_to_docstore_id=store_data["index_to_docstore_id"],
            )
            return vectorstore
        except Exception as e:
            st.error(f"Error loading vector store: {e}")
            return None

    def get_file_path(self, file_id: str, collection_id: Optional[int] = None) -> Optional[Path]:
        """Return the stored file's path, or ``None`` if it does not exist."""
        if collection_id is not None:
            file_path = self.files_path / str(collection_id) / file_id
        else:
            file_path = self.files_path / file_id

        return file_path if file_path.exists() else None

    def cleanup_old_files(self, max_age_days: int = 30):
        """Delete stored files (and their JSON sidecars) older than *max_age_days* days."""
        current_time = datetime.now()

        for file_path in self.files_path.rglob("*"):
            if not file_path.is_file():
                continue
            file_age = current_time - datetime.fromtimestamp(file_path.stat().st_mtime)
            if file_age.days > max_age_days:
                file_path.unlink()
                # Drop the metadata sidecar only for files actually deleted.
                metadata_file = self.metadata_path / f"{file_path.stem}.json"
                if metadata_file.exists():
                    metadata_file.unlink()
151
+
152
# Update database.py to use persistent storage
def create_connection(storage):
    """Create a SQLite connection to the database in persistent storage.

    Args:
        storage: object exposing ``get_db_path() -> str``
            (e.g. :class:`PersistentStorage`).

    Returns:
        ``sqlite3.Connection`` on success, or ``None`` on failure (the
        error is surfaced in the Streamlit UI).
    """
    try:
        # check_same_thread=False: Streamlit may use the connection from
        # different script-run threads; callers must serialize writes.
        conn = sqlite3.connect(storage.get_db_path(), check_same_thread=False)
        return conn
    except sqlite3.Error as e:  # fix: bare `Error` was an undefined name
        st.error(f"Failed to connect to database: {e}")
        return None
161
+
162
# Update document handling to use persistent storage
def handle_document_upload(uploaded_files, **kwargs):
    """Persist, process and index a batch of uploaded documents.

    Each file is saved to persistent storage, its content chunked and
    inserted into the database (optionally attached to a collection),
    and the vector store refreshed with the new chunks.

    Returns:
        True on success; False on any failure (error shown in the UI).
    """
    try:
        storage = PersistentStorage()
        collection_id = kwargs.get('collection_id')

        for doc in uploaded_files:
            _ingest_uploaded_document(storage, doc, collection_id)

        return True
    except Exception as exc:
        st.error(f"Error processing documents: {exc}")
        return False


def _ingest_uploaded_document(storage, doc, collection_id):
    """Persist one upload, record it in the DB and update the vector store."""
    saved_path = storage.save_uploaded_file(doc, collection_id)

    chunks, content = process_document(str(saved_path))

    doc_id = insert_document(st.session_state.db_conn, doc.name, content)
    if collection_id:
        add_document_to_collection(st.session_state.db_conn, doc_id, collection_id)

    storage.save_vectorstore(process_chunks_to_vectorstore(chunks), collection_id)