Spaces:

anasmkh
/

QdrantVectorStore_Llamaindex

Sleeping

App Files Community

anasmkh committed on Feb 13, 2025

Commit

6863650

verified ·

1 Parent(s): fdd2048

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -28

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import shutil
 import gradio as gr
 import qdrant_client
 from getpass import getpass
@@ -33,13 +34,14 @@ client = None
 vector_store = None
 storage_context = None
-# Use a persistent folder to keep uploaded files.
 upload_dir = "uploaded_files"
 if not os.path.exists(upload_dir):
     os.makedirs(upload_dir)
-# A set to track which files have already been processed.
-processed_files = set()
 # -------------------------------------------------------
 # Function to process uploaded files and update the index.
@@ -47,45 +49,66 @@ processed_files = set()
 def process_upload(files):
     """
     Accepts a list of uploaded file paths, saves them to a persistent folder,
-    loads only new documents, and builds (or updates) the vector index and chat engine.
     """
-    global client, vector_store, storage_context, index, query_engine, memory, chat_engine, processed_files
     new_file_paths = []
-    # Loop over each uploaded file.
     for file_path in files:
         file_name = os.path.basename(file_path)
         dest = os.path.join(upload_dir, file_name)
-        # If the file is not already in our folder, copy it.
-        if file_name not in processed_files:
-            if not os.path.exists(dest):
-                shutil.copy(file_path, dest)
             new_file_paths.append(dest)
-            processed_files.add(file_name)
     if not new_file_paths:
         return "No new documents to add."
     # Load only the new documents.
     new_documents = SimpleDirectoryReader(input_files=new_file_paths).load_data()
-    # If this is the first upload, build the index from scratch.
     if index is None:
-        # (Here we use an in-memory Qdrant client. Change ":memory:" to a persistent path if needed.)
-        client = qdrant_client.QdrantClient(location=":memory:")
-        vector_store = QdrantVectorStore(
-            collection_name="paper",
-            client=client,
-            enable_hybrid=True,
-            batch_size=20,
         )
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-        index = VectorStoreIndex.from_documents(new_documents, storage_context=storage_context)
     else:
-        # Otherwise, insert the new documents into the existing index.
         index.insert_documents(new_documents)
-    # Reinitialize query and chat engines so they use the updated index.
     query_engine = index.as_query_engine(vector_store_query_mode="hybrid")
     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
     chat_engine = index.as_chat_engine(
@@ -104,15 +127,12 @@ def process_upload(files):
 # -------------------------------------------------------
 def chat_with_ai(user_input, chat_history):
     global chat_engine
-    # Check if the chat engine is initialized.
     if chat_engine is None:
         return chat_history, "Please upload documents first."
     response = chat_engine.chat(user_input)
     references = response.source_nodes
     ref = []
-    # Extract file names from the source nodes (if available)
     for node in references:
         file_name = node.metadata.get('file_name')
         if file_name and file_name not in ref:
@@ -135,9 +155,9 @@ def gradio_interface():
     with gr.Blocks() as demo:
         gr.Markdown("# Chat Interface for LlamaIndex with File Upload")
         with gr.Tab("Upload Documents"):
             gr.Markdown("Upload PDF, Excel, CSV, DOC/DOCX, or TXT files below:")
-            # The file upload widget: we specify allowed file types.
             file_upload = gr.File(
                 label="Upload Files",
                 file_count="multiple",

 import os
 import shutil
+import time
 import gradio as gr
 import qdrant_client
 from getpass import getpass
 vector_store = None
 storage_context = None
+# Define a persistent collection name.
+collection_name = "paper"
+# Use a persistent folder to store uploaded files.
 upload_dir = "uploaded_files"
 if not os.path.exists(upload_dir):
     os.makedirs(upload_dir)
+# We no longer clear the folder so previously uploaded files are retained.
 # -------------------------------------------------------
 # Function to process uploaded files and update the index.
 def process_upload(files):
     """
     Accepts a list of uploaded file paths, saves them to a persistent folder,
+    loads new documents, and builds or updates the vector index and chat engine.
     """
+    global client, vector_store, storage_context, index, query_engine, memory, chat_engine
+    # Copy files into the upload directory if not already present.
     new_file_paths = []
     for file_path in files:
         file_name = os.path.basename(file_path)
         dest = os.path.join(upload_dir, file_name)
+        if not os.path.exists(dest):
+            shutil.copy(file_path, dest)
             new_file_paths.append(dest)
+    # If no new files are uploaded, notify the user.
     if not new_file_paths:
         return "No new documents to add."
     # Load only the new documents.
     new_documents = SimpleDirectoryReader(input_files=new_file_paths).load_data()
+    # Initialize a persistent Qdrant client.
+    client = qdrant_client.QdrantClient(
+        path="./qdrant_db",
+        prefer_grpc=True
+    )
+    # Ensure the collection exists.
+    from qdrant_client.http import models
+    existing_collections = {col.name for col in client.get_collections().collections}
+    if collection_name not in existing_collections:
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config=models.VectorParams(
+                size=1536,  # text-embedding-ada-002 produces 1536-dimensional vectors.
+                distance=models.Distance.COSINE
+            )
+        )
+        # Wait briefly for the collection creation to complete.
+        time.sleep(1)
+    # Initialize (or re-use) the vector store.
+    vector_store = QdrantVectorStore(
+        collection_name=collection_name,
+        client=client,
+        enable_hybrid=True,
+        batch_size=20,
+    )
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    # Build the index if it doesn't exist; otherwise, update it.
     if index is None:
+        index = VectorStoreIndex.from_documents(
+            SimpleDirectoryReader(upload_dir).load_data(),
+            storage_context=storage_context
         )
     else:
         index.insert_documents(new_documents)
+    # Reinitialize query and chat engines to reflect updates.
     query_engine = index.as_query_engine(vector_store_query_mode="hybrid")
     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
     chat_engine = index.as_chat_engine(
 # -------------------------------------------------------
 def chat_with_ai(user_input, chat_history):
     global chat_engine
     if chat_engine is None:
         return chat_history, "Please upload documents first."
     response = chat_engine.chat(user_input)
     references = response.source_nodes
     ref = []
     for node in references:
         file_name = node.metadata.get('file_name')
         if file_name and file_name not in ref:
     with gr.Blocks() as demo:
         gr.Markdown("# Chat Interface for LlamaIndex with File Upload")
+        # Use Tabs to separate the file upload and chat interfaces.
         with gr.Tab("Upload Documents"):
             gr.Markdown("Upload PDF, Excel, CSV, DOC/DOCX, or TXT files below:")
             file_upload = gr.File(
                 label="Upload Files",
                 file_count="multiple",