Spaces:

Jagukumar
/

Dynamic-chatbot-using-RAG

Build error

App Files Files Community

Jagukumar committed on Nov 27, 2024

Commit

74a5040

verified ·

1 Parent(s): 2a212a4

Upload 4 files

Browse files

Files changed (4) hide show

app.py +88 -0
pin.py +170 -0
processing.py +119 -0
requirements.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from processing import extract_text, preprocess_text_generalized
+from pin import initialize_pinecone, handle_file_upload, query_pinecone, get_openai_embeddings
+import gradio as gr
+from dotenv import load_dotenv
+import os
+import openai
+# Load environment variables
+load_dotenv()
+# OpenAI and Pinecone settings
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+INDEX_NAME = "document-embeddings"
+EMBEDDING_DIMENSION = 1536  # OpenAI embeddings dimension for `text-embedding-ada-002`
+CLOUD = "aws"
+REGION = "us-east-1"
+# Set OpenAI API key
+openai.api_key = OPENAI_API_KEY
+def generate_response(user_query, pinecone_index, namespace="default", model="gpt-3.5-turbo"):
+    """
+    Generate a response to the user's query using OpenAI GPT and Pinecone for context retrieval.
+    """
+    # Step 1: Generate query embedding
+    query_embedding = get_openai_embeddings(user_query)
+    if query_embedding is None:
+        return "Error generating query embedding. Please try again."
+    # Step 2: Retrieve context from Pinecone
+    matches = query_pinecone(pinecone_index, query_embedding, namespace=namespace, top_k=5)
+    context = " ".join([match["metadata"].get("text", "") for match in matches])
+    # Step 3: Create prompt
+    if context.strip():
+        prompt = f"Context: {context}\n\nQuestion: {user_query}\n\nAnswer:"
+    else:
+        # No relevant context found, use a general-purpose prompt
+        prompt = f"Question: {user_query}\n\nAnswer:"
+    # Step 4: Generate response using OpenAI GPT
+    try:
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant capable of answering general questions and questions based on provided context."},
+                {"role": "user", "content": prompt}
+            ]
+        )
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        return f"Error generating response: {e}"
+# Gradio UI for chatbot
+def handle_user_query(file, user_query):
+    """
+    Handles the entire pipeline: dynamically process new file uploads,
+    update embeddings in Pinecone, and generate responses for user queries.
+    """
+    namespace = "user_session"
+    pinecone_index = initialize_pinecone(
+        api_key=PINECONE_API_KEY,
+        index_name=INDEX_NAME,
+        dimension=EMBEDDING_DIMENSION,
+        cloud=CLOUD,
+        region=REGION,
+    )
+    # Process the uploaded file dynamically
+    if file:
+        handle_file_upload(file.name, pinecone_index, namespace=namespace)
+    # Generate response for the user's query
+    return generate_response(user_query, pinecone_index, namespace=namespace)
+# Build the Gradio interface: a title, a file picker, a query box, a read-only
+# answer box and a submit button, declared in this order inside the Blocks context.
+with gr.Blocks() as ui:
+    gr.Markdown("# Dynamic Chatbot with Retrieval-Augmented Generation (RAG)")
+    # Upload widget limited to the extensions listed in `file_types`.
+    file_input = gr.File(label="Upload Document", file_types=[".pdf", ".csv", ".json"])
+    user_query = gr.Textbox(label="Your Query", placeholder="Ask a question...")
+    chatbot_response = gr.Textbox(label="Chatbot Response", interactive=False)
+    submit_button = gr.Button("Submit")
+    # Clicking Submit calls handle_user_query(file, query) and routes its
+    # return value to the response box.
+    submit_button.click(handle_user_query, inputs=[file_input, user_query], outputs=chatbot_response)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    ui.launch()

pin.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import os
+import time
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+import openai
+import hashlib
+from processing import extract_text, preprocess_text_generalized
+# Load environment variables from .env file
+load_dotenv()
+# Get Pinecone and OpenAI API keys from .env
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+INDEX_NAME = "document-embeddings"
+EMBEDDING_DIMENSION = 1536  # OpenAI's embeddings dimension for `text-embedding-ada-002`
+CLOUD = "aws"
+REGION = "us-east-1"
+# Set OpenAI API key
+openai.api_key = OPENAI_API_KEY
+# Initialize Pinecone
+def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1"):
+    """
+    Initializes Pinecone and creates an index if it doesn't exist.
+    """
+    # Create a Pinecone client instance
+    pc = Pinecone(api_key=api_key)
+    # Check if the index exists; if not, create it
+    if index_name not in pc.list_indexes().names():
+        print(f"Index '{index_name}' does not exist. Creating a new index...")
+        pc.create_index(
+            name=index_name,
+            dimension=dimension,
+            metric="cosine",
+            spec=ServerlessSpec(cloud=cloud, region=region)
+        )
+    # Wait for the index to be ready
+    while not pc.describe_index(index_name).status["ready"]:
+        print("Waiting for index to be ready...")
+        time.sleep(1)
+    # Return the Pinecone Index object
+    return pc.Index(index_name)
+# Save embeddings to Pinecone vector DB
+from pinecone.core.openapi.shared.exceptions import NotFoundException
+def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default"):
+    """
+    Save embeddings to Pinecone. Clears old embeddings if they exist.
+    """
+    try:
+        # Check if the namespace exists before attempting deletion
+        index_description = index.describe_index_stats()
+        if namespace in index_description.get("namespaces", {}):
+            index.delete(delete_all=True, namespace=namespace)
+            print(f"Cleared all previous embeddings in namespace: {namespace}")
+        else:
+            print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
+    except Exception as e:
+        print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")
+    if embeddings:
+        vectors = [
+            {"id": f"doc_{i}", "values": embedding, "metadata": metadata}
+            for i, embedding in enumerate(embeddings)
+        ]
+        index.upsert(vectors=vectors, namespace=namespace)
+        print(f"Saved embeddings to namespace: {namespace}")
+    else:
+        print("No embeddings to save. Skipping upsert operation.")
+# Generate embeddings using OpenAI API
+def get_openai_embeddings(text, model="text-embedding-ada-002"):
+    """
+    Generate embeddings for a given text using OpenAI's embedding model.
+    Handles splitting text into chunks if it exceeds the token limit.
+    """
+    max_tokens = 8192  # Adjust based on the model's maximum token limit
+    try:
+        # Split text into smaller chunks
+        chunks = [text[i:i + max_tokens] for i in range(0, len(text), max_tokens)]
+        embeddings = []
+        for chunk in chunks:
+            response = openai.Embedding.create(input=chunk, model=model)
+            embeddings.extend([embedding["embedding"] for embedding in response["data"]])
+        return embeddings
+    except Exception as e:
+        print(f"Error generating embeddings with OpenAI API: {e}")
+        return None
+# Query Pinecone for relevant embeddings
+def query_pinecone(index, query_embedding, namespace="default", top_k=3):
+    """
+    Retrieve relevant embeddings from Pinecone using similarity search.
+    """
+    results = index.query(
+        vector=query_embedding,
+        namespace=namespace,
+        top_k=top_k,
+        include_metadata=True
+    )
+    return results["matches"]  # Returns the top-k matches with metadata
+# Pipeline for handling file uploads and updating Pinecone vector DB
+# Global variable to track the previous file hash
+previous_file_hash = None
+def calculate_file_hash(file_path):
+    """
+    Calculate a hash for the uploaded file to uniquely identify it.
+    """
+    hasher = hashlib.md5()
+    with open(file_path, "rb") as f:
+        while chunk := f.read(8192):
+            hasher.update(chunk)
+    return hasher.hexdigest()
+def handle_file_upload(file_path, pinecone_index, namespace="default"):
+    """
+    Handle the process of uploading a file, clearing old embeddings,
+    and saving new embeddings dynamically.
+    """
+    global previous_file_hash
+    current_file_hash = calculate_file_hash(file_path)
+    if current_file_hash == previous_file_hash:
+        print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
+        return
+    try:
+        text = extract_text(file_path)
+        processed_text = preprocess_text_generalized(text)
+        # Generate embeddings
+        embeddings = get_openai_embeddings(processed_text)
+        if embeddings:
+            metadata = {"file_name": os.path.basename(file_path), "text": processed_text}
+            save_embeddings_to_pinecone(pinecone_index, embeddings, metadata, namespace)
+            previous_file_hash = current_file_hash
+        else:
+            print("Failed to generate embeddings. Skipping save operation.")
+    except Exception as e:
+        print(f"Error processing file upload: {e}")
+# Example usage
+if __name__ == "__main__":
+    # Initialize Pinecone with serverless specifications
+    pinecone_index = initialize_pinecone(
+        api_key=PINECONE_API_KEY,
+        index_name=INDEX_NAME,
+        dimension=EMBEDDING_DIMENSION,
+        cloud=CLOUD,
+        region=REGION
+    )

processing.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import mimetypes
+import pandas as pd
+import PyPDF2
+import json
+import re
+import spacy
+import os
+from dotenv import load_dotenv
+import openai
+import numpy as np
+# Load environment variables
+load_dotenv()
+# Set OpenAI API Key
+openai.api_key = os.getenv("OPENAI_API_KEY")
+# Load SpaCy model
+nlp = spacy.load("en_core_web_sm")
+# Detect file type
+def detect_file_type(file_path):
+    file_type = mimetypes.guess_type(file_path)[0]
+    if file_type in ["application/pdf"]:
+        return "pdf"
+    elif file_type in ["text/csv", "application/vnd.ms-excel"]:
+        return "csv"
+    elif file_type == "application/json":
+        return "json"
+    else:
+        raise ValueError(f"Unsupported file format: {file_type}")
+# Extract text from CSV
+def extract_text_from_csv(file_path):
+    df = pd.read_csv(file_path)
+    text = " ".join(df.astype(str).stack())
+    return text
+# Extract text from PDF
+def extract_text_from_pdf(file_path):
+    pdf_reader = PyPDF2.PdfReader(file_path)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+# Extract text from JSON
+def extract_text_from_json(file_path):
+    def recursive_text_extraction(data):
+        if isinstance(data, dict):
+            return " ".join(recursive_text_extraction(value) for value in data.values())
+        elif isinstance(data, list):
+            return " ".join(recursive_text_extraction(item) for item in data)
+        else:
+            return str(data)
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return recursive_text_extraction(data)
+# Generalized text extraction
+def extract_text(file_path):
+    file_type = detect_file_type(file_path)
+    if file_type == "csv":
+        return extract_text_from_csv(file_path)
+    elif file_type == "pdf":
+        return extract_text_from_pdf(file_path)
+    elif file_type == "json":
+        return extract_text_from_json(file_path)
+    else:
+        raise ValueError("Unsupported file format")
+# Preprocess text
+def preprocess_text_generalized(text):
+    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
+    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-ASCII characters
+    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
+    chunk_size = 100000  # Maximum chunk size
+    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+    processed_chunks = []
+    for chunk in chunks:
+        doc = nlp(chunk.lower())
+        tokens = [
+            token.lemma_
+            for token in doc
+            if not token.is_stop and token.is_alpha
+        ]
+        processed_chunks.append(" ".join(tokens))
+    processed_text = " ".join(processed_chunks)
+    return processed_text
+# Generate embeddings using OpenAI API
+def get_openai_embeddings(text, model="text-embedding-ada-002"):
+    """
+    Generate embeddings for a given text using OpenAI API.
+    """
+    try:
+        response = openai.Embedding.create(input=text, model=model)
+        embeddings = response["data"][0]["embedding"]
+        return np.array(embeddings)  # Convert to NumPy array for compatibility
+    except Exception as e:
+        print(f"Error generating embeddings: {e}")
+        return None
+# Example usage
+if __name__ == "__main__":
+    # Example file path
+    file_path = "example.pdf"
+    # Extract and preprocess text
+    raw_text = extract_text(file_path)
+    preprocessed_text = preprocess_text_generalized(raw_text)
+    # Generate embeddings using OpenAI API
+    embeddings = get_openai_embeddings(preprocessed_text)
+    if embeddings is not None:
+        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
+    else:
+        print("Failed to generate embeddings.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers
+gradio
+pandas
+PyPDF2
+ipykernel
+spacy
+torch
+pinecone
+python-dotenv
+json5
+accelerate==0.26.0
+openai==0.28