Jagukumar committed on
Commit
e582eae
·
verified ·
1 Parent(s): d642926

Update pin.py

Browse files
Files changed (1) hide show
  1. pin.py +170 -170
pin.py CHANGED
@@ -1,170 +1,170 @@
1
- import os
2
- import time
3
- from dotenv import load_dotenv
4
- from pinecone import Pinecone, ServerlessSpec
5
- import openai
6
- import hashlib
7
- from processing import extract_text, preprocess_text_generalized
8
-
9
- # Load environment variables from .env file
10
- load_dotenv()
11
-
12
- # Get Pinecone and OpenAI API keys from .env
13
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
14
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
15
- INDEX_NAME = "document-embeddings"
16
- EMBEDDING_DIMENSION = 1536 # OpenAI's embeddings dimension for `text-embedding-ada-002`
17
- CLOUD = "aws"
18
- REGION = "us-east-1"
19
-
20
- # Set OpenAI API key
21
- openai.api_key = OPENAI_API_KEY
22
-
23
-
24
- # Initialize Pinecone
25
- def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1"):
26
- """
27
- Initializes Pinecone and creates an index if it doesn't exist.
28
- """
29
- # Create a Pinecone client instance
30
- pc = Pinecone(api_key=api_key)
31
-
32
- # Check if the index exists; if not, create it
33
- if index_name not in pc.list_indexes().names():
34
- print(f"Index '{index_name}' does not exist. Creating a new index...")
35
- pc.create_index(
36
- name=index_name,
37
- dimension=dimension,
38
- metric="cosine",
39
- spec=ServerlessSpec(cloud=cloud, region=region)
40
- )
41
-
42
- # Wait for the index to be ready
43
- while not pc.describe_index(index_name).status["ready"]:
44
- print("Waiting for index to be ready...")
45
- time.sleep(1)
46
-
47
- # Return the Pinecone Index object
48
- return pc.Index(index_name)
49
-
50
-
51
- # Save embeddings to Pinecone vector DB
52
- from pinecone.core.openapi.shared.exceptions import NotFoundException
53
-
54
-
55
- def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default"):
56
- """
57
- Save embeddings to Pinecone. Clears old embeddings if they exist.
58
- """
59
- try:
60
- # Check if the namespace exists before attempting deletion
61
- index_description = index.describe_index_stats()
62
- if namespace in index_description.get("namespaces", {}):
63
- index.delete(delete_all=True, namespace=namespace)
64
- print(f"Cleared all previous embeddings in namespace: {namespace}")
65
- else:
66
- print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
67
- except Exception as e:
68
- print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")
69
-
70
- if embeddings:
71
- vectors = [
72
- {"id": f"doc_{i}", "values": embedding, "metadata": metadata}
73
- for i, embedding in enumerate(embeddings)
74
- ]
75
- index.upsert(vectors=vectors, namespace=namespace)
76
- print(f"Saved embeddings to namespace: {namespace}")
77
- else:
78
- print("No embeddings to save. Skipping upsert operation.")
79
-
80
-
81
-
82
- # Generate embeddings using OpenAI API
83
- def get_openai_embeddings(text, model="text-embedding-ada-002"):
84
- """
85
- Generate embeddings for a given text using OpenAI's embedding model.
86
- Handles splitting text into chunks if it exceeds the token limit.
87
- """
88
- max_tokens = 8192 # Adjust based on the model's maximum token limit
89
- try:
90
- # Split text into smaller chunks
91
- chunks = [text[i:i + max_tokens] for i in range(0, len(text), max_tokens)]
92
- embeddings = []
93
- for chunk in chunks:
94
- response = openai.Embedding.create(input=chunk, model=model)
95
- embeddings.extend([embedding["embedding"] for embedding in response["data"]])
96
- return embeddings
97
- except Exception as e:
98
- print(f"Error generating embeddings with OpenAI API: {e}")
99
- return None
100
-
101
- # Query Pinecone for relevant embeddings
102
- def query_pinecone(index, query_embedding, namespace="default", top_k=3):
103
- """
104
- Retrieve relevant embeddings from Pinecone using similarity search.
105
- """
106
- results = index.query(
107
- vector=query_embedding,
108
- namespace=namespace,
109
- top_k=top_k,
110
- include_metadata=True
111
- )
112
- return results["matches"] # Returns the top-k matches with metadata
113
-
114
-
115
- # Pipeline for handling file uploads and updating Pinecone vector DB
116
- # Global variable to track the previous file hash
117
- previous_file_hash = None
118
-
119
- def calculate_file_hash(file_path):
120
- """
121
- Calculate a hash for the uploaded file to uniquely identify it.
122
- """
123
- hasher = hashlib.md5()
124
- with open(file_path, "rb") as f:
125
- while chunk := f.read(8192):
126
- hasher.update(chunk)
127
- return hasher.hexdigest()
128
-
129
- def handle_file_upload(file_path, pinecone_index, namespace="default"):
130
- """
131
- Handle the process of uploading a file, clearing old embeddings,
132
- and saving new embeddings dynamically.
133
- """
134
- global previous_file_hash
135
-
136
- current_file_hash = calculate_file_hash(file_path)
137
- if current_file_hash == previous_file_hash:
138
- print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
139
- return
140
-
141
- try:
142
- text = extract_text(file_path)
143
- processed_text = preprocess_text_generalized(text)
144
-
145
- # Generate embeddings
146
- embeddings = get_openai_embeddings(processed_text)
147
- if embeddings:
148
- metadata = {"file_name": os.path.basename(file_path), "text": processed_text}
149
- save_embeddings_to_pinecone(pinecone_index, embeddings, metadata, namespace)
150
- previous_file_hash = current_file_hash
151
- else:
152
- print("Failed to generate embeddings. Skipping save operation.")
153
- except Exception as e:
154
- print(f"Error processing file upload: {e}")
155
-
156
-
157
-
158
-
159
- # Example usage
160
- if __name__ == "__main__":
161
- # Initialize Pinecone with serverless specifications
162
- pinecone_index = initialize_pinecone(
163
- api_key=PINECONE_API_KEY,
164
- index_name=INDEX_NAME,
165
- dimension=EMBEDDING_DIMENSION,
166
- cloud=CLOUD,
167
- region=REGION
168
- )
169
-
170
-
 
1
+ import os
2
+ import time
3
+ from dotenv import load_dotenv
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ import openai
6
+ import hashlib
7
+ from processing import extract_text, preprocess_text_generalized
8
+
9
# Load environment variables from .env file
load_dotenv()

# Get Pinecone and OpenAI API keys from the environment (.env).
# SECURITY: never commit literal API keys to source control. A previous
# revision of this file hard-coded live Pinecone and OpenAI secrets here;
# any key that was committed must be considered compromised and revoked.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
INDEX_NAME = "document-embeddings"
EMBEDDING_DIMENSION = 1536  # OpenAI's embeddings dimension for `text-embedding-ada-002`
CLOUD = "aws"
REGION = "us-east-1"

# Set OpenAI API key
openai.api_key = OPENAI_API_KEY
22
+
23
+
24
+ # Initialize Pinecone
25
# Initialize Pinecone
def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1"):
    """
    Connect to Pinecone and return a handle to `index_name`, creating the
    serverless index first if it does not already exist.

    Blocks (polling once per second) until the index reports ready.
    """
    client = Pinecone(api_key=api_key)

    # Create the index only when it is missing.
    if index_name not in client.list_indexes().names():
        print(f"Index '{index_name}' does not exist. Creating a new index...")
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    # Wait until the index is ready before handing it back.
    while not client.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

    return client.Index(index_name)
49
+
50
+
51
+ # Save embeddings to Pinecone vector DB
52
+ from pinecone.core.openapi.shared.exceptions import NotFoundException
53
+
54
+
55
def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default"):
    """
    Replace the contents of a Pinecone namespace with new embeddings.

    Any vectors already stored under `namespace` are deleted first; the new
    embeddings are then upserted with ids `doc_0`, `doc_1`, ..., all sharing
    the same `metadata`. Deletion failures are logged but non-fatal.
    """
    try:
        # Only attempt deletion when the namespace already exists; deleting
        # a missing namespace raises on serverless indexes.
        stats = index.describe_index_stats()
        if namespace in stats.get("namespaces", {}):
            index.delete(delete_all=True, namespace=namespace)
            print(f"Cleared all previous embeddings in namespace: {namespace}")
        else:
            print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
    except Exception as e:
        print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")

    if not embeddings:
        print("No embeddings to save. Skipping upsert operation.")
        return

    payload = []
    for position, vector in enumerate(embeddings):
        payload.append({"id": f"doc_{position}", "values": vector, "metadata": metadata})
    index.upsert(vectors=payload, namespace=namespace)
    print(f"Saved embeddings to namespace: {namespace}")
79
+
80
+
81
+
82
# Generate embeddings using OpenAI API
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """
    Embed `text` with the given OpenAI embedding model.

    The text is split into fixed-size *character* chunks as a rough guard
    against the model's token limit (characters, not tokens — a conservative
    proxy), each chunk is embedded separately, and all resulting embedding
    vectors are returned in one flat list. Returns None if the API fails.
    """
    max_tokens = 8192  # chunk size in characters (proxy for the token limit)
    try:
        embeddings = []
        start = 0
        while start < len(text):
            piece = text[start:start + max_tokens]
            response = openai.Embedding.create(input=piece, model=model)
            for item in response["data"]:
                embeddings.append(item["embedding"])
            start += max_tokens
        return embeddings
    except Exception as e:
        print(f"Error generating embeddings with OpenAI API: {e}")
        return None
100
+
101
# Query Pinecone for relevant embeddings
def query_pinecone(index, query_embedding, namespace="default", top_k=3):
    """
    Run a similarity search against the Pinecone index and return the
    top-k matches (with their stored metadata) for `query_embedding`.
    """
    response = index.query(
        vector=query_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True,
    )
    # Pinecone exposes the hits under the "matches" key of the response.
    return response["matches"]
113
+
114
+
115
# Pipeline for handling file uploads and updating Pinecone vector DB
# Module-level cache of the hash of the most recently processed file,
# used to skip re-processing identical uploads.
previous_file_hash = None

def calculate_file_hash(file_path):
    """
    Return the MD5 hex digest of the file at `file_path`.

    The file is read in 8 KiB chunks so arbitrarily large uploads can be
    fingerprinted without loading them fully into memory. MD5 is used here
    only as a fast content fingerprint, not for security.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as handle:
        for block in iter(lambda: handle.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
128
+
129
def handle_file_upload(file_path, pinecone_index, namespace="default"):
    """
    Process an uploaded file end-to-end: skip exact re-uploads, extract and
    preprocess its text, embed it with OpenAI, and store the embeddings in
    the given Pinecone index under `namespace`.

    A module-level hash of the last processed file is consulted so that an
    identical re-upload is skipped without any API calls.
    """
    global previous_file_hash

    current_file_hash = calculate_file_hash(file_path)
    if current_file_hash == previous_file_hash:
        print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
        return

    try:
        raw_text = extract_text(file_path)
        cleaned_text = preprocess_text_generalized(raw_text)

        # Generate embeddings
        embeddings = get_openai_embeddings(cleaned_text)
        if not embeddings:
            print("Failed to generate embeddings. Skipping save operation.")
            return
        file_metadata = {"file_name": os.path.basename(file_path), "text": cleaned_text}
        save_embeddings_to_pinecone(pinecone_index, embeddings, file_metadata, namespace)
        # Remember this file so an identical re-upload is skipped next time.
        previous_file_hash = current_file_hash
    except Exception as e:
        print(f"Error processing file upload: {e}")
155
+
156
+
157
+
158
+
159
# Example usage: connect to (or create) the index when run as a script.
if __name__ == "__main__":
    # Initialize Pinecone with serverless specifications
    # (creates the index on first run, then waits for it to be ready).
    pinecone_index = initialize_pinecone(
        api_key=PINECONE_API_KEY,
        index_name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        cloud=CLOUD,
        region=REGION
    )
169
+
170
+