kamkol committed on
Commit
4f7e4ee
·
1 Parent(s): f4ce103

Major fix: Update LangChain initialization and improve error handling for Hugging Face compatibility

Browse files
Files changed (1) hide show
  1. streamlit_app.py +144 -140
streamlit_app.py CHANGED
@@ -95,199 +95,203 @@ def load_document_chunks():
95
  @st.cache_resource
96
  def get_chat_model():
97
  """Get the chat model for initial RAG."""
98
- try:
99
- # First attempt with minimal params
100
- return ChatOpenAI(
101
- model="gpt-4.1-mini",
102
- temperature=0,
103
- )
104
- except Exception as e:
105
- print(f"Error initializing chat model: {str(e)}")
106
- # Try with just model name
107
- try:
108
- return ChatOpenAI(
109
- model="gpt-4.1-mini"
110
- )
111
- except Exception as e2:
112
- print(f"Final attempt for chat model: {str(e2)}")
113
- # Last resort with no parameters
114
- return ChatOpenAI()
115
 
116
  @st.cache_resource
117
  def get_agent_model():
118
  """Get the more powerful model for agent and evaluation."""
119
- try:
120
- # First attempt with minimal params
121
- return ChatOpenAI(
122
- model="gpt-4.1",
123
- temperature=0,
124
- )
125
- except Exception as e:
126
- print(f"Error initializing agent model: {str(e)}")
127
- # Try with just model name
128
- try:
129
- return ChatOpenAI(
130
- model="gpt-4.1"
131
- )
132
- except Exception as e2:
133
- print(f"Final attempt for agent model: {str(e2)}")
134
- # Last resort with no parameters
135
- return ChatOpenAI()
136
 
137
  @st.cache_resource
138
  def get_embedding_model():
139
  """Get the embedding model."""
140
- from langchain_openai import OpenAIEmbeddings
141
  import os
 
142
 
143
- # Simplest possible initialization
144
- try:
145
- api_key = os.environ.get("OPENAI_API_KEY", "")
146
- print(f"Using API key: {api_key[:4]}...{api_key[-4:] if len(api_key) > 8 else ''}")
147
-
148
- # Most minimal initialization - one parameter only
149
- return OpenAIEmbeddings(model="text-embedding-3-small")
150
- except Exception as e:
151
- print(f"Error initializing embeddings: {str(e)}")
152
- # Try more minimal approach (in case model param is causing issues)
153
- try:
154
- return OpenAIEmbeddings()
155
- except Exception as e2:
156
- print(f"Final attempt to initialize embeddings failed: {str(e2)}")
157
- raise
158
 
159
  @st.cache_resource
160
  def setup_qdrant_client():
161
  """Set up the Qdrant client."""
 
 
 
 
 
 
 
 
 
 
162
  try:
163
  return QdrantClient(path=str(QDRANT_DIR))
164
  except Exception as e:
165
- # If there's an issue with the standard approach, print diagnostics and retry
166
- print(f"QdrantClient initialization error: {str(e)}")
167
- print(f"Checking if directory exists: {os.path.exists(str(QDRANT_DIR))}")
168
 
169
- # Try alternative approach with explicit collection params
170
- if os.path.exists(str(QDRANT_DIR)):
171
- try:
172
- # Try with location parameter instead
173
- return QdrantClient(location=str(QDRANT_DIR))
174
- except Exception as e2:
175
- print(f"Alternative initialization also failed: {str(e2)}")
176
- raise
177
- else:
178
- raise ValueError(f"Qdrant directory does not exist: {str(QDRANT_DIR)}")
179
 
180
  def retrieve_documents(query, k=5):
181
  """Retrieve relevant documents for a query."""
182
- # Get models and data
 
 
183
  try:
184
- embedding_model = get_embedding_model()
185
- chunks = load_document_chunks()
186
- client = setup_qdrant_client()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  # Create a mapping of IDs to documents
189
  docs_by_id = {i: doc for i, doc in enumerate(chunks)}
190
 
191
  # Get query embedding
192
- query_embedding = embedding_model.embed_query(query)
 
 
 
 
 
193
 
194
- # Try various search methods until one works
195
  results = None
 
 
196
  try:
197
- # Try simplest query_points call
198
  results = client.query_points(
199
- collection_name="kohavi_ab_testing_pdf_collection",
200
  query_vector=query_embedding,
201
  limit=k
202
  )
203
- print("Successfully used query_points method")
204
- except Exception as e:
205
- print(f"First query attempt failed: {str(e)}")
 
206
  try:
207
- # Try with explicit parameters
208
- results = client.query_points(
209
- collection_name="kohavi_ab_testing_pdf_collection",
210
  query_vector=query_embedding,
211
- with_payload=True,
212
  limit=k
213
  )
214
- print("Successfully used query_points with explicit parameters")
215
  except Exception as e2:
216
- print(f"Second query attempt failed: {str(e2)}")
217
- try:
218
- # Fall back to deprecated search method
219
- results = client.search(
220
- collection_name="kohavi_ab_testing_pdf_collection",
221
- query_vector=query_embedding,
222
- limit=k
223
- )
224
- print("Successfully used deprecated search method")
225
- except Exception as e3:
226
- print(f"All query methods failed: {str(e3)}")
227
- # No results found - return empty list
228
- return [], []
229
 
230
- # If we got here but results is still None, return empty lists
231
- if results is None:
232
- print("No results found with any query method")
233
  return [], []
234
 
235
- # Convert results to documents
236
  documents = []
237
- sources_dict = {} # Use a dictionary to track unique sources by file+page
238
 
239
- print(f"Retrieved {len(results)} search results")
240
 
241
  for result in results:
242
- doc_id = result.id
243
- if doc_id in docs_by_id:
244
- doc = docs_by_id[doc_id]
245
- documents.append(doc)
246
-
247
- # Debug the metadata
248
- print(f"Document metadata: {doc.metadata}")
249
-
250
- # Extract source info
251
- source_path = doc.metadata.get("source", "")
252
- filename = source_path.split("/")[-1] if "/" in source_path else source_path
253
-
254
- # Remove .pdf extension if present
255
- if filename.lower().endswith('.pdf'):
256
- filename = filename[:-4]
257
-
258
- # Default to the full filename if we can't extract a title
259
- if not filename:
260
- filename = "Unknown Source"
261
 
262
- # Get page number, use a default if not available
263
- page = doc.metadata.get("page", "unknown")
264
-
265
- # All PDF sources in data directory are by Ron Kohavi, so add his name as prefix
266
- title = f"Ron Kohavi: {filename}"
267
-
268
- # Create a unique key for this source based on filename and page
269
- source_key = f"{filename}_{page}"
270
-
271
- # Only add to sources if we haven't seen this exact source (same file, same page) before
272
- if source_key not in sources_dict:
273
- sources_dict[source_key] = {
274
- "title": title,
275
- "page": page,
276
- "score": float(result.score),
277
- "type": "pdf"
278
- }
279
- print(f"Added source: {title}, Page: {page}")
280
- else:
281
- print(f"Skipping duplicate source: {title}, Page: {page}")
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
- # Convert the dictionary of unique sources back to a list
284
  sources = list(sources_dict.values())
285
 
286
  print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
287
  return documents, sources
 
288
  except Exception as e:
289
- print(f"Error in retrieve_documents: {str(e)}")
290
- # Return empty results in case of any error
291
  return [], []
292
 
293
  def rephrase_query(query):
 
95
  @st.cache_resource
96
  def get_chat_model():
97
  """Get the chat model for initial RAG."""
98
+ import os
99
+ # Most minimal initialization possible for Hugging Face environment
100
+ api_key = os.environ.get("OPENAI_API_KEY", "")
101
+ print(f"Initializing chat model with API key starting with: {api_key[:4]}...")
102
+ return ChatOpenAI(api_key=api_key, model_name="gpt-4.1-mini")
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  @st.cache_resource
105
  def get_agent_model():
106
  """Get the more powerful model for agent and evaluation."""
107
+ import os
108
+ # Most minimal initialization possible for Hugging Face environment
109
+ api_key = os.environ.get("OPENAI_API_KEY", "")
110
+ print(f"Initializing agent model with API key starting with: {api_key[:4]}...")
111
+ return ChatOpenAI(api_key=api_key, model_name="gpt-4.1")
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  @st.cache_resource
114
  def get_embedding_model():
115
  """Get the embedding model."""
 
116
  import os
117
+ from langchain_openai import OpenAIEmbeddings
118
 
119
+ # Absolutely minimal initialization for Hugging Face compatibility
120
+ api_key = os.environ.get("OPENAI_API_KEY", "")
121
+ print(f"Initializing embeddings with API key starting with: {api_key[:4]}...")
122
+
123
+ # Minimal parameters - only model and api_key
124
+ return OpenAIEmbeddings(
125
+ model="text-embedding-3-small",
126
+ api_key=api_key
127
+ )
 
 
 
 
 
 
128
 
129
  @st.cache_resource
130
  def setup_qdrant_client():
131
  """Set up the Qdrant client."""
132
+ import os
133
+
134
+ print(f"Setting up Qdrant client with path: {str(QDRANT_DIR)}")
135
+
136
+ # Check if directory exists
137
+ if not os.path.exists(QDRANT_DIR):
138
+ print(f"WARNING: Qdrant directory does not exist: {str(QDRANT_DIR)}")
139
+ raise ValueError(f"Qdrant directory not found at {str(QDRANT_DIR)}")
140
+
141
+ # Try creating the client with minimal parameters
142
  try:
143
  return QdrantClient(path=str(QDRANT_DIR))
144
  except Exception as e:
145
+ print(f"Error initializing QdrantClient with path: {str(e)}")
 
 
146
 
147
+ # Try with location parameter
148
+ try:
149
+ return QdrantClient(location=str(QDRANT_DIR))
150
+ except Exception as e2:
151
+ print(f"Error initializing with location: {str(e2)}")
152
+
153
+ # Last attempt with in-memory client
154
+ print("Attempting to create in-memory client")
155
+ return QdrantClient(":memory:")
 
156
 
157
  def retrieve_documents(query, k=5):
158
  """Retrieve relevant documents for a query."""
159
+ # Define collection name
160
+ collection_name = "kohavi_ab_testing_pdf_collection"
161
+
162
  try:
163
+ print(f"Starting document retrieval for query: '{query[:30]}...'")
164
+
165
+ # Get models and data
166
+ try:
167
+ embedding_model = get_embedding_model()
168
+ except Exception as e:
169
+ print(f"Error getting embedding model: {str(e)}")
170
+ return [], []
171
+
172
+ try:
173
+ chunks = load_document_chunks()
174
+ print(f"Loaded {len(chunks)} document chunks")
175
+ except Exception as e:
176
+ print(f"Error loading document chunks: {str(e)}")
177
+ return [], []
178
+
179
+ try:
180
+ client = setup_qdrant_client()
181
+ print("Successfully created Qdrant client")
182
+ except Exception as e:
183
+ print(f"Error setting up Qdrant client: {str(e)}")
184
+ return [], []
185
+
186
+ # Check if collection exists
187
+ try:
188
+ collections = client.get_collections()
189
+ print(f"Available collections: {collections}")
190
+
191
+ collection_info = client.get_collection(collection_name)
192
+ print(f"Collection info: {collection_info}")
193
+ except Exception as e:
194
+ print(f"Error checking collection: {str(e)}")
195
+ return [], []
196
 
197
  # Create a mapping of IDs to documents
198
  docs_by_id = {i: doc for i, doc in enumerate(chunks)}
199
 
200
  # Get query embedding
201
+ try:
202
+ query_embedding = embedding_model.embed_query(query)
203
+ print(f"Generated embedding of length {len(query_embedding)}")
204
+ except Exception as e:
205
+ print(f"Error creating query embedding: {str(e)}")
206
+ return [], []
207
 
208
+ # Search for relevant documents
209
  results = None
210
+
211
+ # Try different querying approaches
212
  try:
213
+ # Simple query_points call
214
  results = client.query_points(
215
+ collection_name=collection_name,
216
  query_vector=query_embedding,
217
  limit=k
218
  )
219
+ print(f"Retrieved {len(results)} results with query_points")
220
+ except Exception as e1:
221
+ print(f"First query approach failed: {str(e1)}")
222
+
223
  try:
224
+ # Try with minimum parameters
225
+ results = client.search(
226
+ collection_name=collection_name,
227
  query_vector=query_embedding,
 
228
  limit=k
229
  )
230
+ print(f"Retrieved {len(results)} results with search method")
231
  except Exception as e2:
232
+ print(f"Second query approach failed: {str(e2)}")
233
+ return [], []
 
 
 
 
 
 
 
 
 
 
 
234
 
235
+ # Handle empty results
236
+ if not results:
237
+ print("No results found in vector store")
238
  return [], []
239
 
240
+ # Process results
241
  documents = []
242
+ sources_dict = {}
243
 
244
+ print(f"Processing {len(results)} search results")
245
 
246
  for result in results:
247
+ try:
248
+ doc_id = result.id
249
+ if doc_id in docs_by_id:
250
+ doc = docs_by_id[doc_id]
251
+ documents.append(doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ # Extract metadata for sources
254
+ source_path = doc.metadata.get("source", "")
255
+ filename = source_path.split("/")[-1] if "/" in source_path else source_path
256
+
257
+ # Remove .pdf extension if present
258
+ if filename.lower().endswith('.pdf'):
259
+ filename = filename[:-4]
260
+
261
+ # Default to the full filename if we can't extract a title
262
+ if not filename:
263
+ filename = "Unknown Source"
264
+
265
+ # Get page number, use a default if not available
266
+ page = doc.metadata.get("page", "unknown")
267
+
268
+ # Add prefix for consistency
269
+ title = f"Ron Kohavi: {filename}"
270
+
271
+ # Create a unique key for this source
272
+ source_key = f"{filename}_{page}"
273
+
274
+ # Only add unique sources
275
+ if source_key not in sources_dict:
276
+ sources_dict[source_key] = {
277
+ "title": title,
278
+ "page": page,
279
+ "score": float(result.score),
280
+ "type": "pdf"
281
+ }
282
+ print(f"Added source: {title}, Page: {page}")
283
+ except Exception as e:
284
+ print(f"Error processing result: {str(e)}")
285
+ continue
286
 
287
+ # Convert sources dictionary to list
288
  sources = list(sources_dict.values())
289
 
290
  print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
291
  return documents, sources
292
+
293
  except Exception as e:
294
+ print(f"Unexpected error in retrieve_documents: {str(e)}")
 
295
  return [], []
296
 
297
  def rephrase_query(query):