Spaces:

Jagukumar
/

Dynamic-chatbot-using-RAG

Build error

App Files Files Community

Jagukumar committed on Nov 27, 2024

Commit

e582eae

verified ·

1 Parent(s): d642926

Update pin.py

Browse files

Files changed (1) hide show

pin.py +170 -170

pin.py CHANGED Viewed

@@ -1,170 +1,170 @@
-import os
-import time
-from dotenv import load_dotenv
-from pinecone import Pinecone, ServerlessSpec
-import openai
-import hashlib
-from processing import extract_text, preprocess_text_generalized
-# Load environment variables from .env file
-load_dotenv()
-# Get Pinecone and OpenAI API keys from .env
-PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-INDEX_NAME = "document-embeddings"
-EMBEDDING_DIMENSION = 1536  # OpenAI's embeddings dimension for `text-embedding-ada-002`
-CLOUD = "aws"
-REGION = "us-east-1"
-# Set OpenAI API key
-openai.api_key = OPENAI_API_KEY
-# Initialize Pinecone
-def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1"):
-    """
-    Initializes Pinecone and creates an index if it doesn't exist.
-    """
-    # Create a Pinecone client instance
-    pc = Pinecone(api_key=api_key)
-    # Check if the index exists; if not, create it
-    if index_name not in pc.list_indexes().names():
-        print(f"Index '{index_name}' does not exist. Creating a new index...")
-        pc.create_index(
-            name=index_name,
-            dimension=dimension,
-            metric="cosine",
-            spec=ServerlessSpec(cloud=cloud, region=region)
-        )
-    # Wait for the index to be ready
-    while not pc.describe_index(index_name).status["ready"]:
-        print("Waiting for index to be ready...")
-        time.sleep(1)
-    # Return the Pinecone Index object
-    return pc.Index(index_name)
-# Save embeddings to Pinecone vector DB
-from pinecone.core.openapi.shared.exceptions import NotFoundException
-def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default"):
-    """
-    Save embeddings to Pinecone. Clears old embeddings if they exist.
-    """
-    try:
-        # Check if the namespace exists before attempting deletion
-        index_description = index.describe_index_stats()
-        if namespace in index_description.get("namespaces", {}):
-            index.delete(delete_all=True, namespace=namespace)
-            print(f"Cleared all previous embeddings in namespace: {namespace}")
-        else:
-            print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
-    except Exception as e:
-        print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")
-    if embeddings:
-        vectors = [
-            {"id": f"doc_{i}", "values": embedding, "metadata": metadata}
-            for i, embedding in enumerate(embeddings)
-        ]
-        index.upsert(vectors=vectors, namespace=namespace)
-        print(f"Saved embeddings to namespace: {namespace}")
-    else:
-        print("No embeddings to save. Skipping upsert operation.")
-# Generate embeddings using OpenAI API
-def get_openai_embeddings(text, model="text-embedding-ada-002"):
-    """
-    Generate embeddings for a given text using OpenAI's embedding model.
-    Handles splitting text into chunks if it exceeds the token limit.
-    """
-    max_tokens = 8192  # Adjust based on the model's maximum token limit
-    try:
-        # Split text into smaller chunks
-        chunks = [text[i:i + max_tokens] for i in range(0, len(text), max_tokens)]
-        embeddings = []
-        for chunk in chunks:
-            response = openai.Embedding.create(input=chunk, model=model)
-            embeddings.extend([embedding["embedding"] for embedding in response["data"]])
-        return embeddings
-    except Exception as e:
-        print(f"Error generating embeddings with OpenAI API: {e}")
-        return None
-# Query Pinecone for relevant embeddings
-def query_pinecone(index, query_embedding, namespace="default", top_k=3):
-    """
-    Retrieve relevant embeddings from Pinecone using similarity search.
-    """
-    results = index.query(
-        vector=query_embedding,
-        namespace=namespace,
-        top_k=top_k,
-        include_metadata=True
-    )
-    return results["matches"]  # Returns the top-k matches with metadata
-# Pipeline for handling file uploads and updating Pinecone vector DB
-# Global variable to track the previous file hash
-previous_file_hash = None
-def calculate_file_hash(file_path):
-    """
-    Calculate a hash for the uploaded file to uniquely identify it.
-    """
-    hasher = hashlib.md5()
-    with open(file_path, "rb") as f:
-        while chunk := f.read(8192):
-            hasher.update(chunk)
-    return hasher.hexdigest()
-def handle_file_upload(file_path, pinecone_index, namespace="default"):
-    """
-    Handle the process of uploading a file, clearing old embeddings,
-    and saving new embeddings dynamically.
-    """
-    global previous_file_hash
-    current_file_hash = calculate_file_hash(file_path)
-    if current_file_hash == previous_file_hash:
-        print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
-        return
-    try:
-        text = extract_text(file_path)
-        processed_text = preprocess_text_generalized(text)
-        # Generate embeddings
-        embeddings = get_openai_embeddings(processed_text)
-        if embeddings:
-            metadata = {"file_name": os.path.basename(file_path), "text": processed_text}
-            save_embeddings_to_pinecone(pinecone_index, embeddings, metadata, namespace)
-            previous_file_hash = current_file_hash
-        else:
-            print("Failed to generate embeddings. Skipping save operation.")
-    except Exception as e:
-        print(f"Error processing file upload: {e}")
-# Example usage
-if __name__ == "__main__":
-    # Initialize Pinecone with serverless specifications
-    pinecone_index = initialize_pinecone(
-        api_key=PINECONE_API_KEY,
-        index_name=INDEX_NAME,
-        dimension=EMBEDDING_DIMENSION,
-        cloud=CLOUD,
-        region=REGION
-    )

+import os
+import time
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+import openai
+import hashlib
+from processing import extract_text, preprocess_text_generalized
+# Load environment variables from a local .env file, if one is present.
+load_dotenv()
+# Credentials are read from the environment only. API keys must never be
+# hard-coded in source: anything committed to a repository has to be treated
+# as leaked and rotated.
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+INDEX_NAME = "document-embeddings"
+EMBEDDING_DIMENSION = 1536  # OpenAI's embeddings dimension for `text-embedding-ada-002`
+CLOUD = "aws"
+REGION = "us-east-1"
+# Set OpenAI API key
+openai.api_key = OPENAI_API_KEY
+# Initialize Pinecone
+def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1",
+                        timeout=None, poll_interval=1):
+    """
+    Connect to Pinecone and return a handle to `index_name`, creating the
+    index first if it doesn't exist.
+
+    Args:
+        api_key: Pinecone API key used to build the client.
+        index_name: Name of the index to open or create.
+        dimension: Vector dimension used when the index has to be created.
+        cloud: Cloud provider passed to ServerlessSpec on creation.
+        region: Cloud region passed to ServerlessSpec on creation.
+        timeout: Maximum number of seconds to wait for the index to report
+            ready. None (the default) waits indefinitely.
+        poll_interval: Seconds to sleep between readiness checks.
+
+    Returns:
+        The object returned by `pc.Index(index_name)`.
+
+    Raises:
+        TimeoutError: If `timeout` is set and the index is not ready in time.
+    """
+    # Create a Pinecone client instance
+    pc = Pinecone(api_key=api_key)
+    # Check if the index exists; if not, create it
+    if index_name not in pc.list_indexes().names():
+        print(f"Index '{index_name}' does not exist. Creating a new index...")
+        pc.create_index(
+            name=index_name,
+            dimension=dimension,
+            metric="cosine",
+            spec=ServerlessSpec(cloud=cloud, region=region)
+        )
+    # Wait for the index to be ready, giving up once the optional deadline
+    # passes so a stuck index cannot hang the caller forever.
+    deadline = None if timeout is None else time.monotonic() + timeout
+    while not pc.describe_index(index_name).status["ready"]:
+        if deadline is not None and time.monotonic() >= deadline:
+            raise TimeoutError(
+                f"Index '{index_name}' was not ready after {timeout} seconds."
+            )
+        print("Waiting for index to be ready...")
+        time.sleep(poll_interval)
+    # Return the Pinecone Index object
+    return pc.Index(index_name)
+# Save embeddings to Pinecone vector DB
+from pinecone.core.openapi.shared.exceptions import NotFoundException
+def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default", batch_size=100):
+    """
+    Replace the contents of `namespace` with the given embeddings.
+
+    Vectors already stored in the namespace are deleted first. A failure
+    while checking or deleting is printed and does not stop the upsert
+    (best effort).
+
+    Args:
+        index: Index handle providing `describe_index_stats`, `delete` and
+            `upsert`.
+        embeddings: Sequence of embedding vectors; entry i is stored under
+            the id `doc_{i}`. An empty or None value skips the upsert.
+        metadata: Dict attached unchanged to every stored vector.
+        namespace: Namespace to clear and write into.
+        batch_size: Maximum number of vectors sent per upsert request, so a
+            large document is not sent as one oversized request.
+
+    Raises:
+        ValueError: If `batch_size` is less than 1.
+    """
+    if batch_size < 1:
+        raise ValueError("batch_size must be at least 1")
+    try:
+        # Check if the namespace exists before attempting deletion
+        index_description = index.describe_index_stats()
+        if namespace in index_description.get("namespaces", {}):
+            index.delete(delete_all=True, namespace=namespace)
+            print(f"Cleared all previous embeddings in namespace: {namespace}")
+        else:
+            print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
+    except Exception as e:
+        print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")
+    if embeddings:
+        vectors = [
+            {"id": f"doc_{i}", "values": embedding, "metadata": metadata}
+            for i, embedding in enumerate(embeddings)
+        ]
+        # Send the vectors in bounded batches instead of a single request.
+        for start in range(0, len(vectors), batch_size):
+            index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)
+        print(f"Saved embeddings to namespace: {namespace}")
+    else:
+        print("No embeddings to save. Skipping upsert operation.")
+# Generate embeddings using OpenAI API
+def get_openai_embeddings(text, model="text-embedding-ada-002", chunk_size=8192):
+    """
+    Generate embeddings for a given text using OpenAI's embedding model.
+
+    The text is cut into consecutive slices of at most `chunk_size`
+    characters and each slice is embedded with its own API call. Note that
+    the limit is applied to characters, not tokens.
+
+    Args:
+        text: The text to embed.
+        model: Name of the OpenAI embedding model to use.
+        chunk_size: Maximum number of characters per slice.
+
+    Returns:
+        A list with the embeddings returned for each slice, in order (empty
+        when `text` is empty), or None if any step raised an exception.
+    """
+    try:
+        # Split text into smaller chunks (by character count)
+        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+        embeddings = []
+        for chunk in chunks:
+            response = openai.Embedding.create(input=chunk, model=model)
+            embeddings.extend([embedding["embedding"] for embedding in response["data"]])
+        return embeddings
+    except Exception as e:
+        print(f"Error generating embeddings with OpenAI API: {e}")
+        return None
+# Query Pinecone for relevant embeddings
+def query_pinecone(index, query_embedding, namespace="default", top_k=3):
+    """
+    Run a similarity search on `index` and return the matches it reports.
+
+    Args:
+        index: Index handle providing a `query` method.
+        query_embedding: Vector to search with.
+        namespace: Namespace to search in.
+        top_k: Number of matches requested.
+
+    Returns:
+        The "matches" entry of the query response; metadata is requested
+        for every match.
+    """
+    search_args = {
+        "vector": query_embedding,
+        "namespace": namespace,
+        "top_k": top_k,
+        "include_metadata": True,
+    }
+    response = index.query(**search_args)
+    return response["matches"]
+# Pipeline for handling file uploads and updating Pinecone vector DB
+# Hash of the most recently processed file, kept in a module-level global.
+# It starts as None and is compared against each new upload's hash by
+# handle_file_upload to detect a repeated file.
+previous_file_hash = None
+def calculate_file_hash(file_path):
+    """
+    Return the hexadecimal MD5 digest of the file at `file_path`.
+
+    The file is opened in binary mode and read in 8192-byte pieces, so it
+    is never loaded into memory in full.
+    """
+    digest = hashlib.md5()
+    with open(file_path, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for piece in iter(lambda: stream.read(8192), b""):
+            digest.update(piece)
+    return digest.hexdigest()
+def handle_file_upload(file_path, pinecone_index, namespace="default"):
+    """
+    Process an uploaded file: extract and preprocess its text, embed it,
+    and store the embeddings in Pinecone in place of the previous ones.
+
+    A file whose hash equals `previous_file_hash` is skipped. Errors raised
+    while processing are printed rather than propagated; the function
+    always returns None.
+
+    Args:
+        file_path: Path of the uploaded file.
+        pinecone_index: Index handle passed on to save_embeddings_to_pinecone.
+        namespace: Namespace the embeddings are written to.
+    """
+    global previous_file_hash
+    new_hash = calculate_file_hash(file_path)
+    if new_hash == previous_file_hash:
+        print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
+        return
+    try:
+        cleaned_text = preprocess_text_generalized(extract_text(file_path))
+        vectors = get_openai_embeddings(cleaned_text)
+        if not vectors:
+            print("Failed to generate embeddings. Skipping save operation.")
+            return
+        file_metadata = {"file_name": os.path.basename(file_path), "text": cleaned_text}
+        save_embeddings_to_pinecone(pinecone_index, vectors, file_metadata, namespace)
+        # Record the hash only once the save call has returned without
+        # raising, so a failed upload is not mistaken for a processed one.
+        previous_file_hash = new_hash
+    except Exception as e:
+        print(f"Error processing file upload: {e}")
+# Example usage
+if __name__ == "__main__":
+    # Open (creating it if needed) the index described by the module constants.
+    pinecone_index = initialize_pinecone(
+        PINECONE_API_KEY,
+        INDEX_NAME,
+        EMBEDDING_DIMENSION,
+        cloud=CLOUD,
+        region=REGION,
+    )