Spaces:
Build error
Build error
Update utils/database.py
Browse files- utils/database.py +114 -0
utils/database.py
CHANGED
|
@@ -1108,6 +1108,30 @@ def process_document(file_path):
|
|
| 1108 |
return chunks, full_content
|
| 1109 |
|
| 1110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
def display_vector_store_info():
|
| 1112 |
"""
|
| 1113 |
Display information about the current vector store state.
|
|
@@ -1154,6 +1178,96 @@ def display_vector_store_info():
|
|
| 1154 |
st.error(traceback.format_exc())
|
| 1155 |
|
| 1156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1157 |
def initialize_qa_system(vector_store):
|
| 1158 |
"""
|
| 1159 |
Initialize QA system with optimized retrieval.
|
|
|
|
| 1108 |
return chunks, full_content
|
| 1109 |
|
| 1110 |
|
| 1111 |
+
def delete_collection(conn: sqlite3.Connection, collection_id: int) -> bool:
    """Delete a collection and its document associations.

    Rows in ``document_collections`` that reference the collection are
    removed first, then the row in ``collections`` itself. Both deletes are
    committed together; if either statement fails, the pending changes are
    rolled back so a half-finished delete cannot be committed later by an
    unrelated operation on the same connection.

    Args:
        conn: Open SQLite connection to operate on.
        collection_id: Primary key (``collections.id``) of the collection.

    Returns:
        True if both deletes ran and were committed (this is also the result
        when no row matched ``collection_id``); False if SQLite raised an
        error, in which case the error is reported through ``st.error``.
    """
    try:
        # conn_lock is a module-level lock defined elsewhere in this file;
        # presumably it serialises access to the shared connection -- confirm.
        with conn_lock:
            try:
                cursor = conn.cursor()

                # Delete the collection's document associations first
                cursor.execute('''
                    DELETE FROM document_collections
                    WHERE collection_id = ?
                ''', (collection_id,))

                # Then delete the collection itself
                cursor.execute('''
                    DELETE FROM collections
                    WHERE id = ?
                ''', (collection_id,))

                conn.commit()
            except sqlite3.Error:
                # Undo a partial delete while the lock is still held, then
                # let the outer handler report the failure.
                conn.rollback()
                raise

        return True

    except sqlite3.Error as e:
        st.error(f"Error deleting collection: {e}")
        return False
|
| 1134 |
+
|
| 1135 |
def display_vector_store_info():
|
| 1136 |
"""
|
| 1137 |
Display information about the current vector store state.
|
|
|
|
| 1178 |
st.error(traceback.format_exc())
|
| 1179 |
|
| 1180 |
|
| 1181 |
+
def process_and_store_document(uploaded_file) -> Optional[int]:
    """
    Process an uploaded PDF and store its text in the database.

    The upload is written to a temporary ``.pdf`` file, parsed with
    ``PyPDFLoader``, and the text of all pages (joined with newlines) is
    inserted into the ``documents`` table together with the file name and
    the current timestamp. Only the full text is stored here; no chunking
    or vector-store indexing is performed by this function.

    Args:
        uploaded_file: Streamlit's UploadedFile object (must provide
            ``getvalue()`` and ``name``).

    Returns:
        Optional[int]: The ID of the stored document if successful, None
        otherwise. Failures are reported through ``st.error`` together with
        the traceback rather than raised.
    """
    # Local imports: the top of the file is not visible from this block, so
    # these stay function-scoped as in the original.
    import os
    import traceback

    # Stays None until the temp file exists, so cleanup never touches an
    # unbound name when creation itself fails.
    tmp_path = None
    try:
        # Create a temporary file to store the uploaded content
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())
        # The file is closed (and therefore flushed) here; some platforms
        # refuse to reopen a NamedTemporaryFile by name while it is open.

        # Load and process the PDF
        documents = PyPDFLoader(tmp_path).load()

        # Extract full content for database storage
        full_content = "\n".join(doc.page_content for doc in documents)

        # Store in database. NOTE(review): db_conn is assumed to be a
        # sqlite3.Connection, whose context manager commits on success and
        # rolls back on an exception -- confirm against where it is created.
        with st.session_state.db_conn as conn:
            cursor = conn.cursor()

            # Insert document
            cursor.execute('''
                INSERT INTO documents (name, content, upload_date)
                VALUES (?, ?, ?)
            ''', (uploaded_file.name, full_content, datetime.now()))

            # Get the document ID
            document_id = cursor.lastrowid

            conn.commit()

        return document_id

    except Exception as e:
        # Top-level boundary for the UI: surface the failure, don't crash.
        st.error(f"Error processing document {uploaded_file.name}: {str(e)}")
        st.error(traceback.format_exc())
        return None

    finally:
        # Clean up temporary file (best effort: a leftover temp file is
        # harmless, so filesystem errors are deliberately ignored).
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
| 1244 |
+
|
| 1245 |
+
def get_document_content(conn: sqlite3.Connection, document_id: int) -> Optional[str]:
    """
    Look up the stored text of a single document by its ID.

    Args:
        conn: Database connection
        document_id: Value of ``documents.id`` for the wanted row

    Returns:
        Optional[str]: The ``content`` column of the matching row, or None
        when no row matches or SQLite reports an error (the error is shown
        through ``st.error``).
    """
    query = '''
        SELECT content
        FROM documents
        WHERE id = ?
    '''

    try:
        row = conn.cursor().execute(query, (document_id,)).fetchone()
    except sqlite3.Error as e:
        st.error(f"Error retrieving document content: {e}")
        return None

    # fetchone() yields None when the ID is unknown.
    if not row:
        return None
    return row[0]
|
| 1270 |
+
|
| 1271 |
def initialize_qa_system(vector_store):
|
| 1272 |
"""
|
| 1273 |
Initialize QA system with optimized retrieval.
|