kamkol committed on
Commit
0be47a9
·
1 Parent(s): 87e184e

Simplify OpenAIEmbeddings initialization and improve error handling

Browse files
Files changed (1) hide show
  1. streamlit_app.py +110 -101
streamlit_app.py CHANGED
@@ -114,27 +114,21 @@ def get_embedding_model():
114
  from langchain_openai import OpenAIEmbeddings
115
  import os
116
 
117
- # The simplest initialization possible - just model name
118
  try:
 
 
 
 
119
  return OpenAIEmbeddings(model="text-embedding-3-small")
120
  except Exception as e:
121
- print(f"OpenAIEmbeddings initialization error: {str(e)}")
122
-
123
- # Try with just API key, no other parameters
124
  try:
125
- return OpenAIEmbeddings(
126
- model="text-embedding-3-small",
127
- openai_api_key=os.environ.get("OPENAI_API_KEY")
128
- )
129
  except Exception as e2:
130
- print(f"Second attempt failed: {str(e2)}")
131
-
132
- # Last resort - most minimal initialization
133
- return OpenAIEmbeddings(
134
- model="text-embedding-3-small",
135
- openai_api_key=os.environ.get("OPENAI_API_KEY"),
136
- client=None # Let the class create its own client
137
- )
138
 
139
  @st.cache_resource
140
  def setup_qdrant_client():
@@ -160,100 +154,115 @@ def setup_qdrant_client():
160
  def retrieve_documents(query, k=5):
161
  """Retrieve relevant documents for a query."""
162
  # Get models and data
163
- embedding_model = get_embedding_model()
164
- chunks = load_document_chunks()
165
- client = setup_qdrant_client()
166
-
167
- # Create a mapping of IDs to documents
168
- docs_by_id = {i: doc for i, doc in enumerate(chunks)}
169
-
170
- # Get query embedding
171
- query_embedding = embedding_model.embed_query(query)
172
-
173
- # Search Qdrant
174
  try:
175
- # Try the new API method first
176
- results = client.query_points(
177
- collection_name="kohavi_ab_testing_pdf_collection",
178
- query_vector=query_embedding,
179
- limit=k
180
- )
181
- print("Successfully used query_points method")
182
- except Exception as e:
183
- print(f"Error with query_points method: {str(e)}")
 
 
 
184
  try:
185
- # Try a different parameter format
186
  results = client.query_points(
187
  collection_name="kohavi_ab_testing_pdf_collection",
188
  query_vector=query_embedding,
189
- with_payload=True,
190
- with_vectors=False,
191
  limit=k
192
  )
193
- print("Successfully used query_points with alternate parameters")
194
- except Exception as e2:
195
- print(f"Error with alternate query_points: {str(e2)}")
196
- # Fall back to the deprecated method as last resort
197
- results = client.search(
198
- collection_name="kohavi_ab_testing_pdf_collection",
199
- query_vector=query_embedding,
200
- limit=k
201
- )
202
- print("Using deprecated search method")
203
-
204
- # Convert results to documents
205
- documents = []
206
- sources_dict = {} # Use a dictionary to track unique sources by file+page
207
-
208
- print(f"Retrieved {len(results)} search results")
209
-
210
- for result in results:
211
- doc_id = result.id
212
- if doc_id in docs_by_id:
213
- doc = docs_by_id[doc_id]
214
- documents.append(doc)
215
-
216
- # Debug the metadata
217
- print(f"Document metadata: {doc.metadata}")
218
-
219
- # Extract source info
220
- source_path = doc.metadata.get("source", "")
221
- filename = source_path.split("/")[-1] if "/" in source_path else source_path
222
-
223
- # Remove .pdf extension if present
224
- if filename.lower().endswith('.pdf'):
225
- filename = filename[:-4]
226
 
227
- # Default to the full filename if we can't extract a title
228
- if not filename:
229
- filename = "Unknown Source"
 
 
 
 
 
 
 
 
230
 
231
- # Get page number, use a default if not available
232
- page = doc.metadata.get("page", "unknown")
233
-
234
- # All PDF sources in data directory are by Ron Kohavi, so add his name as prefix
235
- title = f"Ron Kohavi: {filename}"
236
-
237
- # Create a unique key for this source based on filename and page
238
- source_key = f"{filename}_{page}"
239
-
240
- # Only add to sources if we haven't seen this exact source (same file, same page) before
241
- if source_key not in sources_dict:
242
- sources_dict[source_key] = {
243
- "title": title,
244
- "page": page,
245
- "score": float(result.score),
246
- "type": "pdf"
247
- }
248
- print(f"Added source: {title}, Page: {page}")
249
- else:
250
- print(f"Skipping duplicate source: {title}, Page: {page}")
251
-
252
- # Convert the dictionary of unique sources back to a list
253
- sources = list(sources_dict.values())
254
-
255
- print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
256
- return documents, sources
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  def rephrase_query(query):
259
  """Rephrase the query to improve retrieval."""
 
114
  from langchain_openai import OpenAIEmbeddings
115
  import os
116
 
117
+ # Simplest possible initialization
118
  try:
119
+ api_key = os.environ.get("OPENAI_API_KEY", "")
120
+ print(f"Using API key: {api_key[:4]}...{api_key[-4:] if len(api_key) > 8 else ''}")
121
+
122
+ # Most minimal initialization - one parameter only
123
  return OpenAIEmbeddings(model="text-embedding-3-small")
124
  except Exception as e:
125
+ print(f"Error initializing embeddings: {str(e)}")
126
+ # Try more minimal approach (in case model param is causing issues)
 
127
  try:
128
+ return OpenAIEmbeddings()
 
 
 
129
  except Exception as e2:
130
+ print(f"Final attempt to initialize embeddings failed: {str(e2)}")
131
+ raise
 
 
 
 
 
 
132
 
133
  @st.cache_resource
134
  def setup_qdrant_client():
 
154
  def retrieve_documents(query, k=5):
155
  """Retrieve relevant documents for a query."""
156
  # Get models and data
 
 
 
 
 
 
 
 
 
 
 
157
  try:
158
+ embedding_model = get_embedding_model()
159
+ chunks = load_document_chunks()
160
+ client = setup_qdrant_client()
161
+
162
+ # Create a mapping of IDs to documents
163
+ docs_by_id = {i: doc for i, doc in enumerate(chunks)}
164
+
165
+ # Get query embedding
166
+ query_embedding = embedding_model.embed_query(query)
167
+
168
+ # Try various search methods until one works
169
+ results = None
170
  try:
171
+ # Try simplest query_points call
172
  results = client.query_points(
173
  collection_name="kohavi_ab_testing_pdf_collection",
174
  query_vector=query_embedding,
 
 
175
  limit=k
176
  )
177
+ print("Successfully used query_points method")
178
+ except Exception as e:
179
+ print(f"First query attempt failed: {str(e)}")
180
+ try:
181
+ # Try with explicit parameters
182
+ results = client.query_points(
183
+ collection_name="kohavi_ab_testing_pdf_collection",
184
+ query_vector=query_embedding,
185
+ with_payload=True,
186
+ limit=k
187
+ )
188
+ print("Successfully used query_points with explicit parameters")
189
+ except Exception as e2:
190
+ print(f"Second query attempt failed: {str(e2)}")
191
+ try:
192
+ # Fall back to deprecated search method
193
+ results = client.search(
194
+ collection_name="kohavi_ab_testing_pdf_collection",
195
+ query_vector=query_embedding,
196
+ limit=k
197
+ )
198
+ print("Successfully used deprecated search method")
199
+ except Exception as e3:
200
+ print(f"All query methods failed: {str(e3)}")
201
+ # No results found - return empty list
202
+ return [], []
203
+
204
+ # If we got here but results is still None, return empty lists
205
+ if results is None:
206
+ print("No results found with any query method")
207
+ return [], []
 
 
208
 
209
+ # Convert results to documents
210
+ documents = []
211
+ sources_dict = {} # Use a dictionary to track unique sources by file+page
212
+
213
+ print(f"Retrieved {len(results)} search results")
214
+
215
+ for result in results:
216
+ doc_id = result.id
217
+ if doc_id in docs_by_id:
218
+ doc = docs_by_id[doc_id]
219
+ documents.append(doc)
220
 
221
+ # Debug the metadata
222
+ print(f"Document metadata: {doc.metadata}")
223
+
224
+ # Extract source info
225
+ source_path = doc.metadata.get("source", "")
226
+ filename = source_path.split("/")[-1] if "/" in source_path else source_path
227
+
228
+ # Remove .pdf extension if present
229
+ if filename.lower().endswith('.pdf'):
230
+ filename = filename[:-4]
231
+
232
+ # Default to the full filename if we can't extract a title
233
+ if not filename:
234
+ filename = "Unknown Source"
235
+
236
+ # Get page number, use a default if not available
237
+ page = doc.metadata.get("page", "unknown")
238
+
239
+ # All PDF sources in data directory are by Ron Kohavi, so add his name as prefix
240
+ title = f"Ron Kohavi: {filename}"
241
+
242
+ # Create a unique key for this source based on filename and page
243
+ source_key = f"{filename}_{page}"
244
+
245
+ # Only add to sources if we haven't seen this exact source (same file, same page) before
246
+ if source_key not in sources_dict:
247
+ sources_dict[source_key] = {
248
+ "title": title,
249
+ "page": page,
250
+ "score": float(result.score),
251
+ "type": "pdf"
252
+ }
253
+ print(f"Added source: {title}, Page: {page}")
254
+ else:
255
+ print(f"Skipping duplicate source: {title}, Page: {page}")
256
+
257
+ # Convert the dictionary of unique sources back to a list
258
+ sources = list(sources_dict.values())
259
+
260
+ print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
261
+ return documents, sources
262
+ except Exception as e:
263
+ print(f"Error in retrieve_documents: {str(e)}")
264
+ # Return empty results in case of any error
265
+ return [], []
266
 
267
  def rephrase_query(query):
268
  """Rephrase the query to improve retrieval."""