Spaces:

anasmkh
/

QdrantVectorStore_Llamaindex

Sleeping

App Files Files Community

anasmkh committed on Feb 13, 2025

Commit

c9eadbe

verified ·

1 Parent(s): 4e3419a

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -38

app.py CHANGED Viewed

@@ -4,10 +4,12 @@ import gradio as gr
 import qdrant_client
 from getpass import getpass
 openai_api_key = os.getenv('OPENAI_API_KEY')
 from llama_index.llms.openai import OpenAI
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.core import Settings
@@ -15,12 +17,14 @@ from llama_index.core import Settings
 Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.4)
 Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
 from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
 from llama_index.vector_stores.qdrant import QdrantVectorStore
 from llama_index.core.memory import ChatMemoryBuffer
 chat_engine = None
 index = None
 query_engine = None
@@ -29,61 +33,85 @@ client = None
 vector_store = None
 storage_context = None
 def process_upload(files):
     upload_dir = "uploaded_files"
     if not os.path.exists(upload_dir):
         os.makedirs(upload_dir)
-    else:
-        for f in os.listdir(upload_dir):
-            os.remove(os.path.join(upload_dir, f))
     for file_path in files:
         file_name = os.path.basename(file_path)
         dest = os.path.join(upload_dir, file_name)
-        shutil.copy(file_path, dest)
-    documents = SimpleDirectoryReader(upload_dir).load_data()
     global client, vector_store, storage_context, index, query_engine, memory, chat_engine
-    client = qdrant_client.QdrantClient(
-        path="./qdrant_db",
-        prefer_grpc=True
-    )
     existing_collections = {col.name for col in client.get_collections().collections}
     if collection_name not in existing_collections:
         client.create_collection(
             collection_name=collection_name,
             vectors_config=models.VectorParams(
-                size=1536,
                 distance=models.Distance.COSINE
             )
         )
-    vector_store = QdrantVectorStore(
-        collection_name=collection_name,
-        client=client,
-        enable_hybrid=True,
-        batch_size=20,
-    )
-    storage_context = StorageContext.from_defaults(vector_store=vector_store)
-    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
-    query_engine = index.as_query_engine(vector_store_query_mode="hybrid")
     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
     chat_engine = index.as_chat_engine(
         chat_mode="context",
         memory=memory,
-        system_prompt=(
-            "You are an AI assistant who answers the user questions,"
-        ),
     )
-    return "Documents uploaded and index built successfully!"
 def chat_with_ai(user_input, chat_history):
     global chat_engine
@@ -92,25 +120,24 @@ def chat_with_ai(user_input, chat_history):
     response = chat_engine.chat(user_input)
     references = response.source_nodes
-    ref, pages = [], []
     for node in references:
         file_name = node.metadata.get('file_name')
         if file_name and file_name not in ref:
             ref.append(file_name)
     complete_response = str(response) + "\n\n"
-    if ref or pages:
         chat_history.append((user_input, complete_response))
     else:
         chat_history.append((user_input, str(response)))
     return chat_history, ""
 def clear_history():
     return [], ""
 def gradio_interface():
     with gr.Blocks() as demo:
         gr.Markdown("# AI Assistant")
@@ -118,10 +145,11 @@ def gradio_interface():
         with gr.Tab("Upload Documents"):
             gr.Markdown("Upload PDF, Excel, CSV, DOC/DOCX, or TXT files below:")
             file_upload = gr.File(
-                    label="Upload Files",
-                    file_count="multiple",
-                    file_types=[".pdf", ".csv", ".txt", ".xlsx", ".xls", ".doc", ".docx"],
-                    type="filepath" )
             upload_status = gr.Textbox(label="Upload Status", interactive=False)
             upload_button = gr.Button("Process Upload")
@@ -144,5 +172,4 @@ def gradio_interface():
     return demo
 gradio_interface().launch(debug=True)

 import qdrant_client
 from getpass import getpass
+# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
 openai_api_key = os.getenv('OPENAI_API_KEY')
+# -------------------------------------------------------
+# Configure LlamaIndex with OpenAI LLM and Embeddings
+# -------------------------------------------------------
 from llama_index.llms.openai import OpenAI
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.core import Settings
 Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.4)
 Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
+# -------------------------------------------------------
+# Import document readers, index, vector store, memory, etc.
+# -------------------------------------------------------
 from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
 from llama_index.vector_stores.qdrant import QdrantVectorStore
 from llama_index.core.memory import ChatMemoryBuffer
+# Global variables to hold persistent objects.
 chat_engine = None
 index = None
 query_engine = None
 vector_store = None
 storage_context = None
+# Define a global collection name (you can change this as needed)
+collection_name = "paper"
 def process_upload(files):
+    """
+    Process newly uploaded files by copying them into a persistent folder,
+    loading their content, and then either building a new index or inserting
+    new documents into the existing index.
+    """
     upload_dir = "uploaded_files"
+    # Create the upload folder if it does not exist.
     if not os.path.exists(upload_dir):
         os.makedirs(upload_dir)
+    # Copy new files into the upload directory.
+    new_file_paths = []
     for file_path in files:
         file_name = os.path.basename(file_path)
         dest = os.path.join(upload_dir, file_name)
+        # Copy the file if it doesn't already exist.
+        if not os.path.exists(dest):
+            shutil.copy(file_path, dest)
+        new_file_paths.append(dest)
+    # Load only the newly uploaded documents.
+    # (SimpleDirectoryReader can accept a list of file paths via the 'input_files' parameter.)
+    documents = SimpleDirectoryReader(input_files=new_file_paths).load_data()
     global client, vector_store, storage_context, index, query_engine, memory, chat_engine
+    # Initialize Qdrant client if not already done.
+    if client is None:
+        client = qdrant_client.QdrantClient(
+            path="./qdrant_db",
+            prefer_grpc=True
+        )
+    # Ensure the collection exists.
+    from qdrant_client.http import models
     existing_collections = {col.name for col in client.get_collections().collections}
     if collection_name not in existing_collections:
         client.create_collection(
             collection_name=collection_name,
             vectors_config=models.VectorParams(
+                size=1536,  # OpenAI's text-embedding-ada-002 produces 1536-d vectors.
                 distance=models.Distance.COSINE
             )
         )
+    # Initialize the vector store if not already done.
+    if vector_store is None:
+        vector_store = QdrantVectorStore(
+            collection_name=collection_name,
+            client=client,
+            enable_hybrid=True,
+            batch_size=20,
+        )
+    # Initialize storage context if not already done.
+    if storage_context is None:
+        storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    # If no index exists yet, create one from the documents.
+    if index is None:
+        index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
+    else:
+        # Append the new documents to the existing index.
+        index.insert_documents(documents)
+    # Rebuild the query and chat engines so they reflect the updated index (this also starts a fresh chat memory).
+    query_engine = index.as_query_engine(vector_store_query_mode="hybrid")
     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
     chat_engine = index.as_chat_engine(
         chat_mode="context",
         memory=memory,
+        system_prompt="You are an AI assistant who answers the user questions,"
     )
+    return "Documents uploaded and index updated successfully!"
 def chat_with_ai(user_input, chat_history):
     global chat_engine
     response = chat_engine.chat(user_input)
     references = response.source_nodes
+    ref = []
+    # Extract referenced file names from the response.
     for node in references:
         file_name = node.metadata.get('file_name')
         if file_name and file_name not in ref:
             ref.append(file_name)
     complete_response = str(response) + "\n\n"
+    if ref:
         chat_history.append((user_input, complete_response))
     else:
         chat_history.append((user_input, str(response)))
     return chat_history, ""
 def clear_history():
     return [], ""
 def gradio_interface():
     with gr.Blocks() as demo:
         gr.Markdown("# AI Assistant")
         with gr.Tab("Upload Documents"):
             gr.Markdown("Upload PDF, Excel, CSV, DOC/DOCX, or TXT files below:")
             file_upload = gr.File(
+                label="Upload Files",
+                file_count="multiple",
+                file_types=[".pdf", ".csv", ".txt", ".xlsx", ".xls", ".doc", ".docx"],
+                type="filepath"  # Returns file paths.
+            )
             upload_status = gr.Textbox(label="Upload Status", interactive=False)
             upload_button = gr.Button("Process Upload")
     return demo
 gradio_interface().launch(debug=True)