kamkol commited on
Commit
525d5c5
·
1 Parent(s): b4efc69

Add debugging code about the preprocessed data

Browse files
Files changed (3) hide show
  1. .gitignore +34 -3
  2. streamlit_app.py +350 -140
  3. verify_data.py +29 -0
.gitignore CHANGED
@@ -1,4 +1,35 @@
1
- notebook_version/
2
  *.pdf
3
- *.pkl
4
- processed_data/qdrant_vectorstore/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore PDF files
2
  *.pdf
3
+
4
+ # Keep processed data
5
+ !processed_data/
6
+ !processed_data/document_chunks.pkl
7
+ !processed_data/qdrant_vectorstore/
8
+
9
+ # Ignore notebook version folder
10
+ notebook_version/
11
+
12
+ # Python
13
+ __pycache__/
14
+ *.py[cod]
15
+ *$py.class
16
+ *.so
17
+ .Python
18
+ env/
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+
34
+ # Jupyter
35
+ .ipynb_checkpoints
streamlit_app.py CHANGED
@@ -32,11 +32,50 @@ print("Loaded .env file")
32
  if not os.environ.get("OPENAI_API_KEY"):
33
  os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY_BACKUP", "")
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # Paths to pre-processed data
36
- PROCESSED_DATA_DIR = Path("processed_data")
37
  CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
38
  QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
39
 
 
 
 
 
 
 
 
 
40
  # Define prompts exactly as in the notebook
41
  RAG_PROMPT = """
42
  CONTEXT:
@@ -72,6 +111,7 @@ evaluate_prompt = PromptTemplate.from_template(EVALUATE_RESPONSE_PROMPT)
72
  @st.cache_resource
73
  def load_document_chunks():
74
  """Load pre-processed document chunks from disk."""
 
75
  if not os.path.exists(CHUNKS_FILE):
76
  print(f"WARNING: Chunks file not found at {CHUNKS_FILE}")
77
  print(f"Working directory contents: {os.listdir('.')}")
@@ -83,116 +123,185 @@ def load_document_chunks():
83
  with open(CHUNKS_FILE, 'rb') as f:
84
  chunks = pickle.load(f)
85
  print(f"Successfully loaded {len(chunks)} document chunks")
 
 
 
86
  return chunks
87
  except Exception as e:
88
  print(f"Error loading document chunks: {str(e)}")
 
 
89
  return []
90
 
91
  @st.cache_resource
92
  def get_chat_model():
93
  """Get the chat model for initial RAG."""
 
 
94
  try:
95
- # Use direct OpenAI client to avoid proxy issues
96
- openai_client = OpenAI()
97
-
98
- # Create a wrapper that mimics LangChain's interface
99
- class SimpleOpenAIWrapper:
100
- def invoke(self, messages):
101
- # Convert LangChain messages to OpenAI format
102
- openai_messages = []
103
- for msg in messages:
104
- role = "user"
105
- if hasattr(msg, "type"):
106
- role = "assistant" if msg.type == "ai" else "user"
107
- openai_messages.append({
108
- "role": role,
109
- "content": msg.content
110
- })
111
-
112
- # Call API directly
113
- response = openai_client.chat.completions.create(
114
- model="gpt-4.1-mini",
115
- messages=openai_messages,
116
- temperature=0
117
- )
118
-
119
- # Create response object with content attribute
120
- class SimpleResponse:
121
- def __init__(self, content):
122
- self.content = content
123
-
124
- return SimpleResponse(response.choices[0].message.content)
125
-
126
- return SimpleOpenAIWrapper()
127
- except Exception as e:
128
- print(f"Error creating OpenAI wrapper: {str(e)}")
129
  try:
130
- # Last resort fallback to basic LangChain with minimal config
131
- return ChatOpenAI(model="gpt-4.1-mini", temperature=0)
132
- except Exception as e2:
133
- print(f"Fallback also failed: {str(e2)}")
134
 
135
- # Create dummy that returns a fixed response
136
- class DummyModel:
137
  def invoke(self, messages):
138
- class DummyResponse:
139
- def __init__(self):
140
- self.content = "I apologize, but I'm unable to process your query right now. Please try again later."
141
- return DummyResponse()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- return DummyModel()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  @st.cache_resource
146
  def get_agent_model():
147
  """Get the more powerful model for agent and evaluation."""
 
 
148
  try:
149
- # Use same approach as get_chat_model
150
- openai_client = OpenAI()
151
-
152
- class SimpleOpenAIWrapper:
153
- def invoke(self, messages):
154
- # Convert LangChain messages to OpenAI format
155
- openai_messages = []
156
- for msg in messages:
157
- role = "user"
158
- if hasattr(msg, "type"):
159
- role = "assistant" if msg.type == "ai" else "user"
160
- openai_messages.append({
161
- "role": role,
162
- "content": msg.content
163
- })
164
-
165
- # Call API directly with a more powerful model
166
- response = openai_client.chat.completions.create(
167
- model="gpt-4.1",
168
- messages=openai_messages,
169
- temperature=0
170
- )
171
-
172
- class SimpleResponse:
173
- def __init__(self, content):
174
- self.content = content
175
-
176
- return SimpleResponse(response.choices[0].message.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- return SimpleOpenAIWrapper()
179
- except Exception as e:
180
- print(f"Error creating agent model: {str(e)}")
181
  try:
182
- # Fallback
183
- return ChatOpenAI(model="gpt-4.1", temperature=0)
184
- except Exception as e2:
185
- print(f"Agent model fallback also failed: {str(e2)}")
186
- # Final fallback to gpt-3.5-turbo
 
 
 
187
  try:
188
- return ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
189
- except:
190
- # Create dummy that returns a fixed response
 
 
 
 
 
 
191
  class DummyModel:
192
  def invoke(self, messages):
 
193
  class DummyResponse:
194
  def __init__(self):
195
- self.content = "I apologize, but I'm unable to process your query right now. Please try again later."
196
  return DummyResponse()
197
 
198
  return DummyModel()
@@ -200,97 +309,157 @@ def get_agent_model():
200
  @st.cache_resource
201
  def get_embedding_model():
202
  """Get the embedding model."""
 
203
  try:
204
- # Create an OpenAI client directly
205
- openai_client = OpenAI()
206
-
207
- # Create a wrapper class that matches the interface LangChain expects
208
- class SimpleEmbeddings:
209
- def embed_query(self, text):
210
- try:
211
- response = openai_client.embeddings.create(
212
- model="text-embedding-3-small",
213
- input=text
214
- )
215
- return response.data[0].embedding
216
- except Exception as e:
217
- print(f"Error in embed_query: {str(e)}")
218
- # Return a dummy embedding of the right size
219
- return [0.0] * 1536 # Standard size for embeddings
220
-
221
- def embed_documents(self, texts):
222
- try:
223
- if not texts:
224
- return []
225
-
226
- # Embed each text individually to avoid batch size issues
227
- return [self.embed_query(text) for text in texts]
228
- except Exception as e:
229
- print(f"Error in embed_documents: {str(e)}")
230
- # Return dummy embeddings
231
- return [[0.0] * 1536 for _ in range(len(texts))]
232
-
233
- return SimpleEmbeddings()
234
- except Exception as e:
235
- print(f"Error initializing embedding model: {str(e)}")
236
- # Last resort fallback
237
  try:
238
- return OpenAIEmbeddings(model="text-embedding-3-small")
239
- except Exception as e2:
240
- print(f"Embedding fallback also failed: {str(e2)}")
241
 
242
- # Return a dummy embeddings class
243
- class DummyEmbeddings:
244
  def embed_query(self, text):
245
- return [0.0] * 1536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  def embed_documents(self, texts):
248
- return [[0.0] * 1536 for _ in range(len(texts))]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- return DummyEmbeddings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  @st.cache_resource
253
  def setup_qdrant_client():
254
  """Set up the Qdrant client."""
 
255
  # Check if Qdrant dir exists
256
  if not os.path.exists(QDRANT_DIR):
257
  print(f"WARNING: Qdrant directory not found: {QDRANT_DIR}")
258
  print(f"Contents of {PROCESSED_DATA_DIR}: {os.listdir(PROCESSED_DATA_DIR) if os.path.exists(PROCESSED_DATA_DIR) else 'Not found'}")
259
 
260
  try:
 
261
  client = QdrantClient(path=str(QDRANT_DIR))
262
- print("Successfully created Qdrant client")
263
 
264
  # Verify client works by getting collections
265
  try:
266
  collection_name = "kohavi_ab_testing_pdf_collection"
 
267
  collections = client.get_collections()
268
- print(f"Available collections: {collections}")
269
 
270
  # Check if our collection exists
271
  collection_exists = False
272
  for collection in collections.collections:
273
  if collection.name == collection_name:
274
  collection_exists = True
 
275
  break
276
 
277
  if not collection_exists:
278
  print(f"WARNING: Collection '{collection_name}' not found!")
279
  except Exception as e:
280
  print(f"Warning: Could not get collections: {str(e)}")
 
 
281
 
282
  return client
283
  except Exception as e:
284
  print(f"Error creating QdrantClient with path: {str(e)}")
 
 
285
 
286
  # Try alternative parameter
287
  try:
 
288
  client = QdrantClient(location=str(QDRANT_DIR))
289
  print("Successfully created QdrantClient with location parameter")
290
  return client
291
  except Exception as e2:
292
  print(f"Alternative initialization failed: {str(e2)}")
293
- raise
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  def rag_chain_node(query):
296
  """
@@ -307,36 +476,69 @@ def rag_chain_node(query):
307
 
308
  # Get embedding for the query
309
  embedding_model = get_embedding_model()
 
310
  query_embedding = embedding_model.embed_query(query)
 
311
 
312
  # Get documents
313
- print("Retrieving documents...")
314
  chunks = load_document_chunks()
 
315
 
316
  # Map of document IDs to actual documents
317
  docs_by_id = {i: doc for i, doc in enumerate(chunks)}
318
 
319
  # Search for relevant documents
320
- search_results = client.search(
321
- collection_name=collection_name,
322
- query_vector=query_embedding,
323
- limit=5
324
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
  # Convert search results to documents
327
  docs = []
 
328
  for result in search_results:
329
  doc_id = result.id
330
  if doc_id in docs_by_id:
331
  docs.append(docs_by_id[doc_id])
 
 
 
332
  except Exception as e:
333
  print(f"Error in document retrieval: {str(e)}")
 
 
334
  return "I'm having trouble retrieving relevant information. Please try again later.", []
335
 
336
  # 2. Extract sources from the documents
337
  sources = []
 
338
  for doc in docs:
339
  source_path = doc.metadata.get("source", "")
 
340
  filename = source_path.split("/")[-1] if "/" in source_path else source_path
341
 
342
  # Remove .pdf extension if present
@@ -348,6 +550,7 @@ def rag_chain_node(query):
348
  "page": doc.metadata.get("page", "unknown"),
349
  "type": "pdf"
350
  })
 
351
 
352
  # 3. Use the RAG chain to generate an answer
353
  if not docs:
@@ -356,6 +559,7 @@ def rag_chain_node(query):
356
 
357
  # Create context from documents
358
  context = "\n\n".join([doc.page_content for doc in docs])
 
359
 
360
  # Format the prompt with context and query
361
  formatted_prompt = rag_prompt.format(context=context, question=query)
@@ -363,10 +567,16 @@ def rag_chain_node(query):
363
  # Send to the model and parse the output
364
  print("Generating answer...")
365
  chat_model = get_chat_model()
366
- response = chat_model.invoke(formatted_prompt)
367
- response_text = response.content
368
-
369
- return response_text, sources
 
 
 
 
 
 
370
 
371
  def evaluate_response(query, response):
372
  """
 
32
  if not os.environ.get("OPENAI_API_KEY"):
33
  os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY_BACKUP", "")
34
 
35
+ # Debugging: Print current directory and its contents
36
+ print(f"Current directory: {os.getcwd()}")
37
+ print(f"Directory contents: {os.listdir('.')}")
38
+
39
+ # Find the processed data directory
40
+ # Try multiple possible paths
41
+ possible_paths = [
42
+ "processed_data",
43
+ "/app/processed_data",
44
+ "../processed_data",
45
+ "./processed_data",
46
+ "/home/user/app/processed_data"
47
+ ]
48
+
49
+ # Find the first path that exists
50
+ for path in possible_paths:
51
+ print(f"Checking path: {path}")
52
+ if os.path.exists(path):
53
+ PROCESSED_DATA_DIR = Path(path)
54
+ print(f"Found processed data at: {path}")
55
+ print(f"Contents: {os.listdir(path)}")
56
+ break
57
+ else:
58
+ # Default if none found
59
+ PROCESSED_DATA_DIR = Path("processed_data")
60
+ print(f"Using default processed data path: {PROCESSED_DATA_DIR}")
61
+
62
+ # Create directory if it doesn't exist (for logging)
63
+ if not os.path.exists(PROCESSED_DATA_DIR):
64
+ os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
65
+ print(f"Created directory: {PROCESSED_DATA_DIR}")
66
+
67
  # Paths to pre-processed data
 
68
  CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
69
  QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
70
 
71
+ # Print paths for debugging
72
+ print(f"CHUNKS_FILE path: {CHUNKS_FILE}")
73
+ print(f"CHUNKS_FILE exists: {os.path.exists(CHUNKS_FILE)}")
74
+ print(f"QDRANT_DIR path: {QDRANT_DIR}")
75
+ print(f"QDRANT_DIR exists: {os.path.exists(QDRANT_DIR)}")
76
+ if os.path.exists(QDRANT_DIR):
77
+ print(f"QDRANT_DIR contents: {os.listdir(QDRANT_DIR)}")
78
+
79
  # Define prompts exactly as in the notebook
80
  RAG_PROMPT = """
81
  CONTEXT:
 
111
  @st.cache_resource
112
  def load_document_chunks():
113
  """Load pre-processed document chunks from disk."""
114
+ print(f"Attempting to load document chunks from {CHUNKS_FILE}")
115
  if not os.path.exists(CHUNKS_FILE):
116
  print(f"WARNING: Chunks file not found at {CHUNKS_FILE}")
117
  print(f"Working directory contents: {os.listdir('.')}")
 
123
  with open(CHUNKS_FILE, 'rb') as f:
124
  chunks = pickle.load(f)
125
  print(f"Successfully loaded {len(chunks)} document chunks")
126
+ # Print first chunk to verify data
127
+ if chunks:
128
+ print(f"First chunk metadata: {chunks[0].metadata}")
129
  return chunks
130
  except Exception as e:
131
  print(f"Error loading document chunks: {str(e)}")
132
+ import traceback
133
+ traceback.print_exc()
134
  return []
135
 
136
  @st.cache_resource
137
  def get_chat_model():
138
  """Get the chat model for initial RAG."""
139
+ print("Initializing chat model...")
140
+ # Try multiple approaches to initialize the model
141
  try:
142
+ # Approach 1: Direct OpenAI client
143
+ print("Trying direct OpenAI client approach")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  try:
145
+ # Use direct OpenAI client to avoid proxy issues
146
+ openai_client = OpenAI()
 
 
147
 
148
+ # Create a wrapper that mimics LangChain's interface
149
+ class SimpleOpenAIWrapper:
150
  def invoke(self, messages):
151
+ print("Invoking SimpleOpenAIWrapper...")
152
+ # Convert LangChain messages to OpenAI format
153
+ openai_messages = []
154
+ for msg in messages:
155
+ role = "user"
156
+ if hasattr(msg, "type"):
157
+ role = "assistant" if msg.type == "ai" else "user"
158
+ openai_messages.append({
159
+ "role": role,
160
+ "content": msg.content
161
+ })
162
+
163
+ # Log what we're sending to OpenAI
164
+ print(f"Sending {len(openai_messages)} messages to OpenAI API")
165
+
166
+ # Call API directly
167
+ response = openai_client.chat.completions.create(
168
+ model="gpt-4.1-mini",
169
+ messages=openai_messages,
170
+ temperature=0
171
+ )
172
+
173
+ # Create response object with content attribute
174
+ class SimpleResponse:
175
+ def __init__(self, content):
176
+ self.content = content
177
+
178
+ result = SimpleResponse(response.choices[0].message.content)
179
+ print(f"Got response from OpenAI (length: {len(result.content)})")
180
+ return result
181
+
182
+ print("Successfully created SimpleOpenAIWrapper")
183
+ return SimpleOpenAIWrapper()
184
+ except Exception as e:
185
+ print(f"Direct OpenAI client approach failed: {str(e)}")
186
+ import traceback
187
+ traceback.print_exc()
188
+ raise
189
+
190
+ except Exception as outer_e:
191
+ print(f"First approach failed: {str(outer_e)}")
192
+
193
+ # Approach 2: Standard LangChain
194
+ try:
195
+ print("Trying standard LangChain approach")
196
+ model = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
197
+ print("Successfully created ChatOpenAI model")
198
+ return model
199
+ except Exception as e:
200
+ print(f"Standard LangChain approach failed: {str(e)}")
201
 
202
+ # Approach 3: Very minimal LangChain
203
+ try:
204
+ print("Trying minimal LangChain approach")
205
+ model = ChatOpenAI(model="gpt-3.5-turbo")
206
+ print("Successfully created minimal ChatOpenAI model")
207
+ return model
208
+ except Exception as e2:
209
+ print(f"Minimal LangChain also failed: {str(e2)}")
210
+
211
+ # Last resort: Dummy implementation
212
+ print("Using dummy model as last resort")
213
+ class DummyModel:
214
+ def invoke(self, messages):
215
+ print("WARNING: Using dummy model that returns fixed responses")
216
+ class DummyResponse:
217
+ def __init__(self):
218
+ self.content = "I apologize, but I'm unable to process your query right now due to a technical issue. The system administrators have been notified."
219
+ return DummyResponse()
220
+
221
+ return DummyModel()
222
 
223
  @st.cache_resource
224
  def get_agent_model():
225
  """Get the more powerful model for agent and evaluation."""
226
+ print("Initializing agent model...")
227
+ # Try multiple approaches to initialize the model
228
  try:
229
+ # Approach 1: Direct OpenAI client
230
+ print("Trying direct OpenAI client approach for agent model")
231
+ try:
232
+ # Use direct OpenAI client to avoid proxy issues
233
+ openai_client = OpenAI()
234
+
235
+ # Create a wrapper that mimics LangChain's interface
236
+ class SimpleOpenAIWrapper:
237
+ def invoke(self, messages):
238
+ print("Invoking agent SimpleOpenAIWrapper...")
239
+ # Convert LangChain messages to OpenAI format
240
+ openai_messages = []
241
+ for msg in messages:
242
+ role = "user"
243
+ if hasattr(msg, "type"):
244
+ role = "assistant" if msg.type == "ai" else "user"
245
+ openai_messages.append({
246
+ "role": role,
247
+ "content": msg.content
248
+ })
249
+
250
+ # Log what we're sending to OpenAI
251
+ print(f"Sending {len(openai_messages)} messages to OpenAI API (agent)")
252
+
253
+ # Call API directly with a more powerful model
254
+ response = openai_client.chat.completions.create(
255
+ model="gpt-4.1",
256
+ messages=openai_messages,
257
+ temperature=0
258
+ )
259
+
260
+ class SimpleResponse:
261
+ def __init__(self, content):
262
+ self.content = content
263
+
264
+ result = SimpleResponse(response.choices[0].message.content)
265
+ print(f"Got agent response from OpenAI (length: {len(result.content)})")
266
+ return result
267
+
268
+ print("Successfully created agent SimpleOpenAIWrapper")
269
+ return SimpleOpenAIWrapper()
270
+ except Exception as e:
271
+ print(f"Direct OpenAI client approach for agent failed: {str(e)}")
272
+ import traceback
273
+ traceback.print_exc()
274
+ raise
275
+
276
+ except Exception as outer_e:
277
+ print(f"First agent approach failed: {str(outer_e)}")
278
 
279
+ # Approach 2: Standard LangChain
 
 
280
  try:
281
+ print("Trying standard LangChain approach for agent")
282
+ model = ChatOpenAI(model="gpt-4.1", temperature=0)
283
+ print("Successfully created agent ChatOpenAI model")
284
+ return model
285
+ except Exception as e:
286
+ print(f"Standard LangChain approach for agent failed: {str(e)}")
287
+
288
+ # Approach 3: Very minimal LangChain with fallback model
289
  try:
290
+ print("Trying minimal LangChain approach for agent")
291
+ model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
292
+ print("Successfully created minimal agent ChatOpenAI model")
293
+ return model
294
+ except Exception as e2:
295
+ print(f"Minimal LangChain for agent also failed: {str(e2)}")
296
+
297
+ # Last resort: Dummy implementation
298
+ print("Using dummy agent model as last resort")
299
  class DummyModel:
300
  def invoke(self, messages):
301
+ print("WARNING: Using dummy agent model that returns fixed responses")
302
  class DummyResponse:
303
  def __init__(self):
304
+ self.content = "I apologize, but I'm unable to process complex queries right now due to a technical issue."
305
  return DummyResponse()
306
 
307
  return DummyModel()
 
309
  @st.cache_resource
310
  def get_embedding_model():
311
  """Get the embedding model."""
312
+ print("Initializing embedding model...")
313
  try:
314
+ # Approach 1: Direct OpenAI client
315
+ print("Trying direct OpenAI client approach for embeddings")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  try:
317
+ # Create an OpenAI client directly
318
+ openai_client = OpenAI()
 
319
 
320
+ # Create a wrapper class that matches the interface LangChain expects
321
+ class SimpleEmbeddings:
322
  def embed_query(self, text):
323
+ print(f"Embedding query text (length: {len(text)})")
324
+ try:
325
+ response = openai_client.embeddings.create(
326
+ model="text-embedding-3-small",
327
+ input=text
328
+ )
329
+ print("Successfully got embedding from OpenAI API")
330
+ return response.data[0].embedding
331
+ except Exception as e:
332
+ print(f"Error in embed_query: {str(e)}")
333
+ import traceback
334
+ traceback.print_exc()
335
+ # Return a dummy embedding of the right size
336
+ print("WARNING: Returning dummy embedding vector")
337
+ return [0.0] * 1536 # Standard size for embeddings
338
 
339
  def embed_documents(self, texts):
340
+ print(f"Embedding {len(texts)} documents")
341
+ try:
342
+ if not texts:
343
+ return []
344
+
345
+ # Embed each text individually to avoid batch size issues
346
+ results = []
347
+ for i, text in enumerate(texts):
348
+ print(f"Embedding document {i+1}/{len(texts)}")
349
+ results.append(self.embed_query(text))
350
+ return results
351
+ except Exception as e:
352
+ print(f"Error in embed_documents: {str(e)}")
353
+ import traceback
354
+ traceback.print_exc()
355
+ # Return dummy embeddings
356
+ print("WARNING: Returning dummy document embeddings")
357
+ return [[0.0] * 1536 for _ in range(len(texts))]
358
+
359
+ print("Successfully created SimpleEmbeddings")
360
+ return SimpleEmbeddings()
361
+ except Exception as e:
362
+ print(f"Direct OpenAI client approach for embeddings failed: {str(e)}")
363
+ import traceback
364
+ traceback.print_exc()
365
+ raise
366
+
367
+ except Exception as outer_e:
368
+ print(f"First embedding approach failed: {str(outer_e)}")
369
+
370
+ # Approach 2: Standard LangChain OpenAIEmbeddings
371
+ try:
372
+ print("Trying standard LangChain approach for embeddings")
373
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
374
+ print("Successfully created OpenAIEmbeddings")
375
+ return embeddings
376
+ except Exception as e:
377
+ print(f"Standard OpenAIEmbeddings failed: {str(e)}")
378
 
379
+ # Approach 3: Very minimal OpenAIEmbeddings
380
+ try:
381
+ print("Trying minimal OpenAIEmbeddings")
382
+ embeddings = OpenAIEmbeddings()
383
+ print("Successfully created minimal OpenAIEmbeddings")
384
+ return embeddings
385
+ except Exception as e2:
386
+ print(f"Minimal OpenAIEmbeddings failed: {str(e2)}")
387
+
388
+ # Last resort: Dummy implementation
389
+ print("Using dummy embeddings as last resort")
390
+ class DummyEmbeddings:
391
+ def embed_query(self, text):
392
+ print("WARNING: Using dummy embeddings")
393
+ return [0.0] * 1536
394
+
395
+ def embed_documents(self, texts):
396
+ print("WARNING: Using dummy document embeddings")
397
+ return [[0.0] * 1536 for _ in range(len(texts))]
398
+
399
+ return DummyEmbeddings()
400
 
401
  @st.cache_resource
402
  def setup_qdrant_client():
403
  """Set up the Qdrant client."""
404
+ print(f"Attempting to setup Qdrant client with path: {QDRANT_DIR}")
405
  # Check if Qdrant dir exists
406
  if not os.path.exists(QDRANT_DIR):
407
  print(f"WARNING: Qdrant directory not found: {QDRANT_DIR}")
408
  print(f"Contents of {PROCESSED_DATA_DIR}: {os.listdir(PROCESSED_DATA_DIR) if os.path.exists(PROCESSED_DATA_DIR) else 'Not found'}")
409
 
410
  try:
411
+ print("Trying to create QdrantClient with path parameter")
412
  client = QdrantClient(path=str(QDRANT_DIR))
413
+ print("Successfully created Qdrant client with path parameter")
414
 
415
  # Verify client works by getting collections
416
  try:
417
  collection_name = "kohavi_ab_testing_pdf_collection"
418
+ print(f"Trying to get collections from Qdrant")
419
  collections = client.get_collections()
420
+ print(f"Available collections: {collections.collections}")
421
 
422
  # Check if our collection exists
423
  collection_exists = False
424
  for collection in collections.collections:
425
  if collection.name == collection_name:
426
  collection_exists = True
427
+ print(f"Found our collection: {collection_name}")
428
  break
429
 
430
  if not collection_exists:
431
  print(f"WARNING: Collection '{collection_name}' not found!")
432
  except Exception as e:
433
  print(f"Warning: Could not get collections: {str(e)}")
434
+ import traceback
435
+ traceback.print_exc()
436
 
437
  return client
438
  except Exception as e:
439
  print(f"Error creating QdrantClient with path: {str(e)}")
440
+ import traceback
441
+ traceback.print_exc()
442
 
443
  # Try alternative parameter
444
  try:
445
+ print("Trying to create QdrantClient with location parameter")
446
  client = QdrantClient(location=str(QDRANT_DIR))
447
  print("Successfully created QdrantClient with location parameter")
448
  return client
449
  except Exception as e2:
450
  print(f"Alternative initialization failed: {str(e2)}")
451
+
452
+ # Try in-memory as last resort (for testing)
453
+ try:
454
+ print("FALLBACK: Creating in-memory QdrantClient")
455
+ client = QdrantClient(":memory:")
456
+ print("Created in-memory QdrantClient as fallback")
457
+ return client
458
+ except Exception as e3:
459
+ print(f"Even in-memory Qdrant failed: {str(e3)}")
460
+ import traceback
461
+ traceback.print_exc()
462
+ raise
463
 
464
  def rag_chain_node(query):
465
  """
 
476
 
477
  # Get embedding for the query
478
  embedding_model = get_embedding_model()
479
+ print("Getting embedding for query...")
480
  query_embedding = embedding_model.embed_query(query)
481
+ print(f"Generated embedding of length: {len(query_embedding)}")
482
 
483
  # Get documents
484
+ print("Loading document chunks...")
485
  chunks = load_document_chunks()
486
+ print(f"Loaded {len(chunks)} document chunks")
487
 
488
  # Map of document IDs to actual documents
489
  docs_by_id = {i: doc for i, doc in enumerate(chunks)}
490
 
491
  # Search for relevant documents
492
+ print(f"Searching collection '{collection_name}' for documents...")
493
+ try:
494
+ # First try using query_points (preferred method)
495
+ print("Trying query_points method first...")
496
+ search_results = client.query_points(
497
+ collection_name=collection_name,
498
+ query_vector=query_embedding,
499
+ limit=5
500
+ )
501
+ print(f"Found {len(search_results)} results using query_points method")
502
+ except Exception as e1:
503
+ print(f"query_points method failed: {str(e1)}")
504
+
505
+ # Fall back to search method
506
+ print("Falling back to search method...")
507
+ try:
508
+ search_results = client.search(
509
+ collection_name=collection_name,
510
+ query_vector=query_embedding,
511
+ limit=5
512
+ )
513
+ print(f"Found {len(search_results)} results using search method")
514
+ except Exception as e2:
515
+ print(f"Both query methods failed: {str(e2)}")
516
+ import traceback
517
+ traceback.print_exc()
518
+ raise
519
 
520
  # Convert search results to documents
521
  docs = []
522
+ print("Processing search results...")
523
  for result in search_results:
524
  doc_id = result.id
525
  if doc_id in docs_by_id:
526
  docs.append(docs_by_id[doc_id])
527
+ print(f"Added doc with ID {doc_id}")
528
+ else:
529
+ print(f"Warning: Doc ID {doc_id} not found in loaded chunks")
530
  except Exception as e:
531
  print(f"Error in document retrieval: {str(e)}")
532
+ import traceback
533
+ traceback.print_exc()
534
  return "I'm having trouble retrieving relevant information. Please try again later.", []
535
 
536
  # 2. Extract sources from the documents
537
  sources = []
538
+ print(f"Extracting sources from {len(docs)} documents...")
539
  for doc in docs:
540
  source_path = doc.metadata.get("source", "")
541
+ print(f"Processing source: {source_path}")
542
  filename = source_path.split("/")[-1] if "/" in source_path else source_path
543
 
544
  # Remove .pdf extension if present
 
550
  "page": doc.metadata.get("page", "unknown"),
551
  "type": "pdf"
552
  })
553
+ print(f"Added source: {filename}, Page: {doc.metadata.get('page', 'unknown')}")
554
 
555
  # 3. Use the RAG chain to generate an answer
556
  if not docs:
 
559
 
560
  # Create context from documents
561
  context = "\n\n".join([doc.page_content for doc in docs])
562
+ print(f"Created context of length: {len(context)}")
563
 
564
  # Format the prompt with context and query
565
  formatted_prompt = rag_prompt.format(context=context, question=query)
 
567
  # Send to the model and parse the output
568
  print("Generating answer...")
569
  chat_model = get_chat_model()
570
+ try:
571
+ response = chat_model.invoke(formatted_prompt)
572
+ response_text = response.content
573
+ print(f"Generated response of length: {len(response_text)}")
574
+ return response_text, sources
575
+ except Exception as e:
576
+ print(f"Error generating response: {str(e)}")
577
+ import traceback
578
+ traceback.print_exc()
579
+ return "I encountered an error while generating a response. Please try again.", sources
580
 
581
  def evaluate_response(query, response):
582
  """
verify_data.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ # Check various possible locations
5
+ possible_paths = [
6
+ "processed_data",
7
+ "/app/processed_data",
8
+ "../processed_data",
9
+ "./processed_data"
10
+ ]
11
+
12
+ for path in possible_paths:
13
+ chunks_file = Path(path) / "document_chunks.pkl"
14
+ qdrant_dir = Path(path) / "qdrant_vectorstore"
15
+
16
+ print(f"Checking path: {path}")
17
+ print(f" Exists?: {os.path.exists(path)}")
18
+
19
+ if os.path.exists(path):
20
+ print(f" Contents: {os.listdir(path)}")
21
+ print(f" Chunks file exists?: {os.path.exists(chunks_file)}")
22
+ print(f" Qdrant dir exists?: {os.path.exists(qdrant_dir)}")
23
+
24
+ if os.path.exists(qdrant_dir):
25
+ print(f" Qdrant contents: {os.listdir(qdrant_dir)}")
26
+
27
+ # Show current working directory and its contents
28
+ print(f"Current directory: {os.getcwd()}")
29
+ print(f"Contents: {os.listdir('.')}")