Spaces:

nazib61
/

qdarnt

Sleeping

App Files Files Community

nazib61 committed on Oct 6

Commit

a472bce

verified ·

1 Parent(s): 7401237

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -37

app.py CHANGED Viewed

@@ -3,70 +3,84 @@ from datasets import load_dataset
 from qdrant_client import QdrantClient, models
 from sentence_transformers import SentenceTransformer
 import torch # Ensure torch is imported
 # --- Configuration ---
-# Use ":memory:" for a temporary, in-memory database.
-# Or use a path like "./qdrant_db" to save the data to disk.
-# Using a path is better for Spaces as data will be rebuilt only when the code changes.
 QDRANT_PATH = "./qdrant_db"
 COLLECTION_NAME = "my_text_collection"
-MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
 # --- Load Model ---
-# Specify that the model should run on the CPU, which is standard for HF Spaces
 device = "cpu"
 model = SentenceTransformer(MODEL_NAME, device=device)
 # --- Qdrant Client and Collection Setup ---
-# Initialize Qdrant client to use a local, on-disk storage
-# This avoids the need to run a separate Qdrant server
 qdrant_client = QdrantClient(path=QDRANT_PATH)
 # Check if the collection already exists
 try:
     collection_info = qdrant_client.get_collection(collection_name=COLLECTION_NAME)
     print("Collection already exists.")
 except Exception as e:
-    print("Collection not found, creating a new one...")
-    # --- Load Dataset ---
-    # We only load the dataset and create embeddings if the collection doesn't exist
     dataset = load_dataset("ag_news", split="test")
-    # Limiting the dataset for a quicker demo setup
-    data = [item['text'] for item in dataset][:1000]
-    # Create the collection
     qdrant_client.create_collection(
         collection_name=COLLECTION_NAME,
-        vectors_config=models.VectorParams(size=model.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
     )
-    # --- Generate and Index Embeddings ---
     print("Generating and indexing embeddings...")
-    # This can take a moment on the first run
-    qdrant_client.add(
         collection_name=COLLECTION_NAME,
-        documents=data,
-        ids=list(range(len(data))), # Simple sequential IDs
-        embedding_model=model
     )
     print("Embeddings indexed successfully.")
 # --- Search Function ---
 def search_in_qdrant(query):
-    """
-    Takes a user query, generates its embedding, and searches in Qdrant.
-    """
     if not query:
         return "Please enter a search query."
-    # The client's search function can now take the model directly
     hits = qdrant_client.search(
         collection_name=COLLECTION_NAME,
-        query_text=query,
-        query_filter=None, # No filters for now
-        limit=5, # Return the top 5 most similar results
-        embedding_model=model
     )
     results_text = ""
@@ -74,23 +88,131 @@ def search_in_qdrant(query):
         return "No results found."
     for hit in hits:
-        results_text += f"**Score:** {hit.score:.4f}\n"
-        results_text += f"**Text:** {hit.payload['document']}\n\n" # Payload key is 'document' when using .add()
     return results_text
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Search with Qdrant and Gradio")
     gr.Markdown("Enter a query to search for similar news articles from the AG News dataset.")
-    with gr.Row():
-        search_input = gr.Textbox(label="Search Query", placeholder="e.g., 'Latest news on space exploration'")
-    search_button = gr.Button("Search")
-    search_output = gr.Markdown()
-    search_button.click(search_in_qdrant, inputs=search_input, outputs=search_output)
 if __name__ == "__main__":
     demo.launch()

 from qdrant_client import QdrantClient, models
 from sentence_transformers import SentenceTransformer
 import torch # Ensure torch is imported
+import os
+import shutil
+import PyPDF2
+from docx import Document
+import pandas as pd
 # --- Configuration ---
 QDRANT_PATH = "./qdrant_db"
 COLLECTION_NAME = "my_text_collection"
+MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'  # Better model for semantic similarity
 # --- Load Model ---
 device = "cpu"
 model = SentenceTransformer(MODEL_NAME, device=device)
 # --- Qdrant Client and Collection Setup ---
 qdrant_client = QdrantClient(path=QDRANT_PATH)
 # Check if the collection already exists
+collection_exists = False
 try:
     collection_info = qdrant_client.get_collection(collection_name=COLLECTION_NAME)
     print("Collection already exists.")
+    collection_exists = True
 except Exception as e:
+    print(f"Collection not found: {e}, creating a new one...")
+    collection_exists = False
+# If collection doesn't exist, create it and populate with data
+if not collection_exists:
+    # Load dataset and convert to a simple list format
     dataset = load_dataset("ag_news", split="test")
+    # Convert dataset to pandas dataframe to properly access the text column
+    df = dataset.to_pandas()
+    data = df['text'].tolist()[:1000]  # Get first 1000 text entries
+    # Create the collection with proper vector configuration
+    # Use the correct vector size for the selected model
+    vector_size = model.get_sentence_embedding_dimension() or 768  # Get the actual embedding size of the model, default to 768 for mpnet
     qdrant_client.create_collection(
         collection_name=COLLECTION_NAME,
+        vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE),
     )
+    # Generate embeddings manually to ensure compatibility
     print("Generating and indexing embeddings...")
+    embeddings = model.encode(data)
+    # Prepare points for insertion
+    points = []
+    for i, (text, embedding) in enumerate(zip(data, embeddings)):
+        point = models.PointStruct(
+            id=i,
+            vector=embedding.tolist(),
+            payload={"document": text}
+        )
+        points.append(point)
+    # Upload points to the collection
+    qdrant_client.upsert(
         collection_name=COLLECTION_NAME,
+        points=points
     )
     print("Embeddings indexed successfully.")
 # --- Search Function ---
 def search_in_qdrant(query):
     if not query:
         return "Please enter a search query."
+    # Generate embedding for the query
+    query_embedding = model.encode([query])[0].tolist()
     hits = qdrant_client.search(
         collection_name=COLLECTION_NAME,
+        query_vector=query_embedding,
+        limit=5,
     )
     results_text = ""
         return "No results found."
     for hit in hits:
+        # Check if payload exists and has the document key
+        if hit.payload and 'document' in hit.payload:
+            results_text += f"**Score:** {hit.score:.4f}\n"
+            results_text += f"**Text:** {hit.payload['document']}\n\n"
+        else:
+            results_text += f"**Score:** {hit.score:.4f}\n"
+            results_text += f"**Text:** [No document content available]\n\n"
     return results_text
+# --- Upload Function ---
def _read_plain_text(file_path):
    """Read *file_path* as text, trying UTF-8 first and Latin-1 as a fallback."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so decoding itself cannot
        # fail here; only I/O problems on the second open can.
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except (OSError, ValueError):
            return "Could not read file: unsupported format or encoding issue"


def extract_text_from_file(file_path):
    """Extract text from a file, picking a reader from its extension.

    The extension is the lower-cased text after the last ``.`` in the path:

    * ``pdf``: text of every page via ``PyPDF2.PdfReader``, one page per line
      group; pages that yield no text contribute an empty string.
    * ``docx`` / ``doc``: paragraph texts via ``docx.Document``.
      NOTE(review): python-docx targets ``.docx``; legacy ``.doc`` files
      presumably fail to open here - confirm.
    * ``csv`` / ``xlsx`` / ``xls``: loaded with pandas and rendered with
      ``DataFrame.to_string()``.
    * anything else (including ``txt``): read as plain text, UTF-8 first,
      then Latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. For plain-text files whose fallback read fails,
        the message ``"Could not read file: unsupported format or encoding
        issue"`` is returned instead.

    Raises:
        OSError: If the file cannot be opened on the first attempt.
        Exception: Whatever the PDF, Word or pandas reader raises for a
            malformed document.
    """
    file_extension = file_path.lower().rsplit('.', 1)[-1]

    if file_extension == 'pdf':
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # ``extract_text()`` may yield nothing for image-only pages;
            # ``or ""`` keeps the concatenation from failing on None.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    if file_extension in ('docx', 'doc'):
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    if file_extension in ('csv', 'xlsx', 'xls'):
        if file_extension == 'csv':
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        # Convert the entire dataframe to text
        return df.to_string()

    # ``txt`` and every unrecognised extension share the same tolerant reader,
    # so a Latin-1 encoded .txt no longer fails while a Latin-1 .md succeeds.
    return _read_plain_text(file_path)
def upload_to_qdrant(text_content, file_upload=None):
    """Embed the given text and/or uploaded file and add them to the collection.

    Each non-blank source (the text box content, the extracted file text)
    becomes one point whose payload is ``{"document": <text>}``.

    Args:
        text_content: Free text entered by the user; blank or whitespace-only
            input is ignored.
        file_upload: Optional uploaded file. Either an object exposing the
            file path as ``.name`` or the path string itself.

    Returns:
        A status message: a success line with the number of documents added,
        or an explanation of why nothing was uploaded.
    """
    # Local import keeps this function self-contained.
    import uuid

    has_text = bool(text_content and text_content.strip())
    if not has_text and not file_upload:
        return "Please provide text content or upload a file."

    documents_to_add = []
    if has_text:
        documents_to_add.append(text_content)

    if file_upload:
        # Accept both a file wrapper with ``.name`` and a bare path string.
        file_path = getattr(file_upload, "name", file_upload)
        try:
            content = extract_text_from_file(file_path)
        except Exception as e:
            return f"Error reading file: {str(e)}"
        # An empty extraction (e.g. an empty file) carries nothing to index.
        if content and content.strip():
            documents_to_add.append(content)

    if not documents_to_add:
        return "No content to upload."

    # Generate embeddings for the new documents
    embeddings = model.encode(documents_to_add)

    # Random UUIDs avoid deriving IDs from the point count, which collides
    # with (and, via upsert, silently overwrites) existing points whenever
    # the count does not match the highest ID or the count lookup fails.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding.tolist(),
            payload={"document": doc},
        )
        for doc, embedding in zip(documents_to_add, embeddings)
    ]

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=points,
    )
    return f"Successfully added {len(documents_to_add)} document(s) to the collection."
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown("# Semantic Search with Qdrant and Gradio")
     gr.Markdown("Enter a query to search for similar news articles from the AG News dataset.")
+    with gr.Tab("Search"):
+        with gr.Row():
+            search_input = gr.Textbox(label="Search Query", placeholder="e.g., 'Latest news on space exploration'")
+        search_button = gr.Button("Search")
+        search_output = gr.Markdown()
+        search_button.click(search_in_qdrant, inputs=search_input, outputs=search_output)
+    with gr.Tab("Upload"):
+        with gr.Row():
+            text_input = gr.Textbox(label="Text Content", placeholder="Enter text to add to the collection", lines=5)
+        with gr.Row():
+            file_input = gr.File(label="Or Upload a File", file_types=['.txt', '.pdf', '.docx', '.csv', '.xlsx', '.xls', '.md'])
+        upload_button = gr.Button("Upload to Collection")
+        upload_output = gr.Textbox(label="Upload Status", interactive=False)
+        upload_button.click(upload_to_qdrant, inputs=[text_input, file_input], outputs=upload_output)
 if __name__ == "__main__":
     demo.launch()