Spaces:
Sleeping
Sleeping
Add debugging code about the preprocessed data
Browse files- .gitignore +34 -3
- streamlit_app.py +350 -140
- verify_data.py +29 -0
.gitignore
CHANGED
|
@@ -1,4 +1,35 @@
|
|
| 1 |
-
|
| 2 |
*.pdf
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore PDF files
|
| 2 |
*.pdf
|
| 3 |
+
|
| 4 |
+
# Keep processed data
|
| 5 |
+
!processed_data/
|
| 6 |
+
!processed_data/document_chunks.pkl
|
| 7 |
+
!processed_data/qdrant_vectorstore/
|
| 8 |
+
|
| 9 |
+
# Ignore notebook version folder
|
| 10 |
+
notebook_version/
|
| 11 |
+
|
| 12 |
+
# Python
|
| 13 |
+
__pycache__/
|
| 14 |
+
*.py[cod]
|
| 15 |
+
*$py.class
|
| 16 |
+
*.so
|
| 17 |
+
.Python
|
| 18 |
+
env/
|
| 19 |
+
build/
|
| 20 |
+
develop-eggs/
|
| 21 |
+
dist/
|
| 22 |
+
downloads/
|
| 23 |
+
eggs/
|
| 24 |
+
.eggs/
|
| 25 |
+
lib/
|
| 26 |
+
lib64/
|
| 27 |
+
parts/
|
| 28 |
+
sdist/
|
| 29 |
+
var/
|
| 30 |
+
*.egg-info/
|
| 31 |
+
.installed.cfg
|
| 32 |
+
*.egg
|
| 33 |
+
|
| 34 |
+
# Jupyter
|
| 35 |
+
.ipynb_checkpoints
|
streamlit_app.py
CHANGED
|
@@ -32,11 +32,50 @@ print("Loaded .env file")
|
|
| 32 |
if not os.environ.get("OPENAI_API_KEY"):
|
| 33 |
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY_BACKUP", "")
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# Paths to pre-processed data
|
| 36 |
-
PROCESSED_DATA_DIR = Path("processed_data")
|
| 37 |
CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
|
| 38 |
QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# Define prompts exactly as in the notebook
|
| 41 |
RAG_PROMPT = """
|
| 42 |
CONTEXT:
|
|
@@ -72,6 +111,7 @@ evaluate_prompt = PromptTemplate.from_template(EVALUATE_RESPONSE_PROMPT)
|
|
| 72 |
@st.cache_resource
|
| 73 |
def load_document_chunks():
|
| 74 |
"""Load pre-processed document chunks from disk."""
|
|
|
|
| 75 |
if not os.path.exists(CHUNKS_FILE):
|
| 76 |
print(f"WARNING: Chunks file not found at {CHUNKS_FILE}")
|
| 77 |
print(f"Working directory contents: {os.listdir('.')}")
|
|
@@ -83,116 +123,185 @@ def load_document_chunks():
|
|
| 83 |
with open(CHUNKS_FILE, 'rb') as f:
|
| 84 |
chunks = pickle.load(f)
|
| 85 |
print(f"Successfully loaded {len(chunks)} document chunks")
|
|
|
|
|
|
|
|
|
|
| 86 |
return chunks
|
| 87 |
except Exception as e:
|
| 88 |
print(f"Error loading document chunks: {str(e)}")
|
|
|
|
|
|
|
| 89 |
return []
|
| 90 |
|
| 91 |
@st.cache_resource
|
| 92 |
def get_chat_model():
|
| 93 |
"""Get the chat model for initial RAG."""
|
|
|
|
|
|
|
| 94 |
try:
|
| 95 |
-
#
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
# Create a wrapper that mimics LangChain's interface
|
| 99 |
-
class SimpleOpenAIWrapper:
|
| 100 |
-
def invoke(self, messages):
|
| 101 |
-
# Convert LangChain messages to OpenAI format
|
| 102 |
-
openai_messages = []
|
| 103 |
-
for msg in messages:
|
| 104 |
-
role = "user"
|
| 105 |
-
if hasattr(msg, "type"):
|
| 106 |
-
role = "assistant" if msg.type == "ai" else "user"
|
| 107 |
-
openai_messages.append({
|
| 108 |
-
"role": role,
|
| 109 |
-
"content": msg.content
|
| 110 |
-
})
|
| 111 |
-
|
| 112 |
-
# Call API directly
|
| 113 |
-
response = openai_client.chat.completions.create(
|
| 114 |
-
model="gpt-4.1-mini",
|
| 115 |
-
messages=openai_messages,
|
| 116 |
-
temperature=0
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
# Create response object with content attribute
|
| 120 |
-
class SimpleResponse:
|
| 121 |
-
def __init__(self, content):
|
| 122 |
-
self.content = content
|
| 123 |
-
|
| 124 |
-
return SimpleResponse(response.choices[0].message.content)
|
| 125 |
-
|
| 126 |
-
return SimpleOpenAIWrapper()
|
| 127 |
-
except Exception as e:
|
| 128 |
-
print(f"Error creating OpenAI wrapper: {str(e)}")
|
| 129 |
try:
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
except Exception as e2:
|
| 133 |
-
print(f"Fallback also failed: {str(e2)}")
|
| 134 |
|
| 135 |
-
# Create
|
| 136 |
-
class
|
| 137 |
def invoke(self, messages):
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
@st.cache_resource
|
| 146 |
def get_agent_model():
|
| 147 |
"""Get the more powerful model for agent and evaluation."""
|
|
|
|
|
|
|
| 148 |
try:
|
| 149 |
-
#
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
openai_messages
|
| 161 |
-
|
| 162 |
-
"
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
except Exception as e:
|
| 180 |
-
print(f"Error creating agent model: {str(e)}")
|
| 181 |
try:
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
class DummyModel:
|
| 192 |
def invoke(self, messages):
|
|
|
|
| 193 |
class DummyResponse:
|
| 194 |
def __init__(self):
|
| 195 |
-
self.content = "I apologize, but I'm unable to process
|
| 196 |
return DummyResponse()
|
| 197 |
|
| 198 |
return DummyModel()
|
|
@@ -200,97 +309,157 @@ def get_agent_model():
|
|
| 200 |
@st.cache_resource
|
| 201 |
def get_embedding_model():
|
| 202 |
"""Get the embedding model."""
|
|
|
|
| 203 |
try:
|
| 204 |
-
#
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
# Create a wrapper class that matches the interface LangChain expects
|
| 208 |
-
class SimpleEmbeddings:
|
| 209 |
-
def embed_query(self, text):
|
| 210 |
-
try:
|
| 211 |
-
response = openai_client.embeddings.create(
|
| 212 |
-
model="text-embedding-3-small",
|
| 213 |
-
input=text
|
| 214 |
-
)
|
| 215 |
-
return response.data[0].embedding
|
| 216 |
-
except Exception as e:
|
| 217 |
-
print(f"Error in embed_query: {str(e)}")
|
| 218 |
-
# Return a dummy embedding of the right size
|
| 219 |
-
return [0.0] * 1536 # Standard size for embeddings
|
| 220 |
-
|
| 221 |
-
def embed_documents(self, texts):
|
| 222 |
-
try:
|
| 223 |
-
if not texts:
|
| 224 |
-
return []
|
| 225 |
-
|
| 226 |
-
# Embed each text individually to avoid batch size issues
|
| 227 |
-
return [self.embed_query(text) for text in texts]
|
| 228 |
-
except Exception as e:
|
| 229 |
-
print(f"Error in embed_documents: {str(e)}")
|
| 230 |
-
# Return dummy embeddings
|
| 231 |
-
return [[0.0] * 1536 for _ in range(len(texts))]
|
| 232 |
-
|
| 233 |
-
return SimpleEmbeddings()
|
| 234 |
-
except Exception as e:
|
| 235 |
-
print(f"Error initializing embedding model: {str(e)}")
|
| 236 |
-
# Last resort fallback
|
| 237 |
try:
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
print(f"Embedding fallback also failed: {str(e2)}")
|
| 241 |
|
| 242 |
-
#
|
| 243 |
-
class
|
| 244 |
def embed_query(self, text):
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
def embed_documents(self, texts):
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
@st.cache_resource
|
| 253 |
def setup_qdrant_client():
|
| 254 |
"""Set up the Qdrant client."""
|
|
|
|
| 255 |
# Check if Qdrant dir exists
|
| 256 |
if not os.path.exists(QDRANT_DIR):
|
| 257 |
print(f"WARNING: Qdrant directory not found: {QDRANT_DIR}")
|
| 258 |
print(f"Contents of {PROCESSED_DATA_DIR}: {os.listdir(PROCESSED_DATA_DIR) if os.path.exists(PROCESSED_DATA_DIR) else 'Not found'}")
|
| 259 |
|
| 260 |
try:
|
|
|
|
| 261 |
client = QdrantClient(path=str(QDRANT_DIR))
|
| 262 |
-
print("Successfully created Qdrant client")
|
| 263 |
|
| 264 |
# Verify client works by getting collections
|
| 265 |
try:
|
| 266 |
collection_name = "kohavi_ab_testing_pdf_collection"
|
|
|
|
| 267 |
collections = client.get_collections()
|
| 268 |
-
print(f"Available collections: {collections}")
|
| 269 |
|
| 270 |
# Check if our collection exists
|
| 271 |
collection_exists = False
|
| 272 |
for collection in collections.collections:
|
| 273 |
if collection.name == collection_name:
|
| 274 |
collection_exists = True
|
|
|
|
| 275 |
break
|
| 276 |
|
| 277 |
if not collection_exists:
|
| 278 |
print(f"WARNING: Collection '{collection_name}' not found!")
|
| 279 |
except Exception as e:
|
| 280 |
print(f"Warning: Could not get collections: {str(e)}")
|
|
|
|
|
|
|
| 281 |
|
| 282 |
return client
|
| 283 |
except Exception as e:
|
| 284 |
print(f"Error creating QdrantClient with path: {str(e)}")
|
|
|
|
|
|
|
| 285 |
|
| 286 |
# Try alternative parameter
|
| 287 |
try:
|
|
|
|
| 288 |
client = QdrantClient(location=str(QDRANT_DIR))
|
| 289 |
print("Successfully created QdrantClient with location parameter")
|
| 290 |
return client
|
| 291 |
except Exception as e2:
|
| 292 |
print(f"Alternative initialization failed: {str(e2)}")
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
def rag_chain_node(query):
|
| 296 |
"""
|
|
@@ -307,36 +476,69 @@ def rag_chain_node(query):
|
|
| 307 |
|
| 308 |
# Get embedding for the query
|
| 309 |
embedding_model = get_embedding_model()
|
|
|
|
| 310 |
query_embedding = embedding_model.embed_query(query)
|
|
|
|
| 311 |
|
| 312 |
# Get documents
|
| 313 |
-
print("
|
| 314 |
chunks = load_document_chunks()
|
|
|
|
| 315 |
|
| 316 |
# Map of document IDs to actual documents
|
| 317 |
docs_by_id = {i: doc for i, doc in enumerate(chunks)}
|
| 318 |
|
| 319 |
# Search for relevant documents
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
|
| 326 |
# Convert search results to documents
|
| 327 |
docs = []
|
|
|
|
| 328 |
for result in search_results:
|
| 329 |
doc_id = result.id
|
| 330 |
if doc_id in docs_by_id:
|
| 331 |
docs.append(docs_by_id[doc_id])
|
|
|
|
|
|
|
|
|
|
| 332 |
except Exception as e:
|
| 333 |
print(f"Error in document retrieval: {str(e)}")
|
|
|
|
|
|
|
| 334 |
return "I'm having trouble retrieving relevant information. Please try again later.", []
|
| 335 |
|
| 336 |
# 2. Extract sources from the documents
|
| 337 |
sources = []
|
|
|
|
| 338 |
for doc in docs:
|
| 339 |
source_path = doc.metadata.get("source", "")
|
|
|
|
| 340 |
filename = source_path.split("/")[-1] if "/" in source_path else source_path
|
| 341 |
|
| 342 |
# Remove .pdf extension if present
|
|
@@ -348,6 +550,7 @@ def rag_chain_node(query):
|
|
| 348 |
"page": doc.metadata.get("page", "unknown"),
|
| 349 |
"type": "pdf"
|
| 350 |
})
|
|
|
|
| 351 |
|
| 352 |
# 3. Use the RAG chain to generate an answer
|
| 353 |
if not docs:
|
|
@@ -356,6 +559,7 @@ def rag_chain_node(query):
|
|
| 356 |
|
| 357 |
# Create context from documents
|
| 358 |
context = "\n\n".join([doc.page_content for doc in docs])
|
|
|
|
| 359 |
|
| 360 |
# Format the prompt with context and query
|
| 361 |
formatted_prompt = rag_prompt.format(context=context, question=query)
|
|
@@ -363,10 +567,16 @@ def rag_chain_node(query):
|
|
| 363 |
# Send to the model and parse the output
|
| 364 |
print("Generating answer...")
|
| 365 |
chat_model = get_chat_model()
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
def evaluate_response(query, response):
|
| 372 |
"""
|
|
|
|
| 32 |
if not os.environ.get("OPENAI_API_KEY"):
|
| 33 |
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY_BACKUP", "")
|
| 34 |
|
| 35 |
+
# Debugging: Print current directory and its contents
|
| 36 |
+
print(f"Current directory: {os.getcwd()}")
|
| 37 |
+
print(f"Directory contents: {os.listdir('.')}")
|
| 38 |
+
|
| 39 |
+
# Find the processed data directory
|
| 40 |
+
# Try multiple possible paths
|
| 41 |
+
possible_paths = [
|
| 42 |
+
"processed_data",
|
| 43 |
+
"/app/processed_data",
|
| 44 |
+
"../processed_data",
|
| 45 |
+
"./processed_data",
|
| 46 |
+
"/home/user/app/processed_data"
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# Find the first path that exists
|
| 50 |
+
for path in possible_paths:
|
| 51 |
+
print(f"Checking path: {path}")
|
| 52 |
+
if os.path.exists(path):
|
| 53 |
+
PROCESSED_DATA_DIR = Path(path)
|
| 54 |
+
print(f"Found processed data at: {path}")
|
| 55 |
+
print(f"Contents: {os.listdir(path)}")
|
| 56 |
+
break
|
| 57 |
+
else:
|
| 58 |
+
# Default if none found
|
| 59 |
+
PROCESSED_DATA_DIR = Path("processed_data")
|
| 60 |
+
print(f"Using default processed data path: {PROCESSED_DATA_DIR}")
|
| 61 |
+
|
| 62 |
+
# Create directory if it doesn't exist (for logging)
|
| 63 |
+
if not os.path.exists(PROCESSED_DATA_DIR):
|
| 64 |
+
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
|
| 65 |
+
print(f"Created directory: {PROCESSED_DATA_DIR}")
|
| 66 |
+
|
| 67 |
# Paths to pre-processed data
|
|
|
|
| 68 |
CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
|
| 69 |
QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
|
| 70 |
|
| 71 |
+
# Print paths for debugging
|
| 72 |
+
print(f"CHUNKS_FILE path: {CHUNKS_FILE}")
|
| 73 |
+
print(f"CHUNKS_FILE exists: {os.path.exists(CHUNKS_FILE)}")
|
| 74 |
+
print(f"QDRANT_DIR path: {QDRANT_DIR}")
|
| 75 |
+
print(f"QDRANT_DIR exists: {os.path.exists(QDRANT_DIR)}")
|
| 76 |
+
if os.path.exists(QDRANT_DIR):
|
| 77 |
+
print(f"QDRANT_DIR contents: {os.listdir(QDRANT_DIR)}")
|
| 78 |
+
|
| 79 |
# Define prompts exactly as in the notebook
|
| 80 |
RAG_PROMPT = """
|
| 81 |
CONTEXT:
|
|
|
|
| 111 |
@st.cache_resource
|
| 112 |
def load_document_chunks():
|
| 113 |
"""Load pre-processed document chunks from disk."""
|
| 114 |
+
print(f"Attempting to load document chunks from {CHUNKS_FILE}")
|
| 115 |
if not os.path.exists(CHUNKS_FILE):
|
| 116 |
print(f"WARNING: Chunks file not found at {CHUNKS_FILE}")
|
| 117 |
print(f"Working directory contents: {os.listdir('.')}")
|
|
|
|
| 123 |
with open(CHUNKS_FILE, 'rb') as f:
|
| 124 |
chunks = pickle.load(f)
|
| 125 |
print(f"Successfully loaded {len(chunks)} document chunks")
|
| 126 |
+
# Print first chunk to verify data
|
| 127 |
+
if chunks:
|
| 128 |
+
print(f"First chunk metadata: {chunks[0].metadata}")
|
| 129 |
return chunks
|
| 130 |
except Exception as e:
|
| 131 |
print(f"Error loading document chunks: {str(e)}")
|
| 132 |
+
import traceback
|
| 133 |
+
traceback.print_exc()
|
| 134 |
return []
|
| 135 |
|
| 136 |
@st.cache_resource
|
| 137 |
def get_chat_model():
|
| 138 |
"""Get the chat model for initial RAG."""
|
| 139 |
+
print("Initializing chat model...")
|
| 140 |
+
# Try multiple approaches to initialize the model
|
| 141 |
try:
|
| 142 |
+
# Approach 1: Direct OpenAI client
|
| 143 |
+
print("Trying direct OpenAI client approach")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
try:
|
| 145 |
+
# Use direct OpenAI client to avoid proxy issues
|
| 146 |
+
openai_client = OpenAI()
|
|
|
|
|
|
|
| 147 |
|
| 148 |
+
# Create a wrapper that mimics LangChain's interface
|
| 149 |
+
class SimpleOpenAIWrapper:
|
| 150 |
def invoke(self, messages):
|
| 151 |
+
print("Invoking SimpleOpenAIWrapper...")
|
| 152 |
+
# Convert LangChain messages to OpenAI format
|
| 153 |
+
openai_messages = []
|
| 154 |
+
for msg in messages:
|
| 155 |
+
role = "user"
|
| 156 |
+
if hasattr(msg, "type"):
|
| 157 |
+
role = "assistant" if msg.type == "ai" else "user"
|
| 158 |
+
openai_messages.append({
|
| 159 |
+
"role": role,
|
| 160 |
+
"content": msg.content
|
| 161 |
+
})
|
| 162 |
+
|
| 163 |
+
# Log what we're sending to OpenAI
|
| 164 |
+
print(f"Sending {len(openai_messages)} messages to OpenAI API")
|
| 165 |
+
|
| 166 |
+
# Call API directly
|
| 167 |
+
response = openai_client.chat.completions.create(
|
| 168 |
+
model="gpt-4.1-mini",
|
| 169 |
+
messages=openai_messages,
|
| 170 |
+
temperature=0
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
# Create response object with content attribute
|
| 174 |
+
class SimpleResponse:
|
| 175 |
+
def __init__(self, content):
|
| 176 |
+
self.content = content
|
| 177 |
+
|
| 178 |
+
result = SimpleResponse(response.choices[0].message.content)
|
| 179 |
+
print(f"Got response from OpenAI (length: {len(result.content)})")
|
| 180 |
+
return result
|
| 181 |
+
|
| 182 |
+
print("Successfully created SimpleOpenAIWrapper")
|
| 183 |
+
return SimpleOpenAIWrapper()
|
| 184 |
+
except Exception as e:
|
| 185 |
+
print(f"Direct OpenAI client approach failed: {str(e)}")
|
| 186 |
+
import traceback
|
| 187 |
+
traceback.print_exc()
|
| 188 |
+
raise
|
| 189 |
+
|
| 190 |
+
except Exception as outer_e:
|
| 191 |
+
print(f"First approach failed: {str(outer_e)}")
|
| 192 |
+
|
| 193 |
+
# Approach 2: Standard LangChain
|
| 194 |
+
try:
|
| 195 |
+
print("Trying standard LangChain approach")
|
| 196 |
+
model = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
|
| 197 |
+
print("Successfully created ChatOpenAI model")
|
| 198 |
+
return model
|
| 199 |
+
except Exception as e:
|
| 200 |
+
print(f"Standard LangChain approach failed: {str(e)}")
|
| 201 |
|
| 202 |
+
# Approach 3: Very minimal LangChain
|
| 203 |
+
try:
|
| 204 |
+
print("Trying minimal LangChain approach")
|
| 205 |
+
model = ChatOpenAI(model="gpt-3.5-turbo")
|
| 206 |
+
print("Successfully created minimal ChatOpenAI model")
|
| 207 |
+
return model
|
| 208 |
+
except Exception as e2:
|
| 209 |
+
print(f"Minimal LangChain also failed: {str(e2)}")
|
| 210 |
+
|
| 211 |
+
# Last resort: Dummy implementation
|
| 212 |
+
print("Using dummy model as last resort")
|
| 213 |
+
class DummyModel:
|
| 214 |
+
def invoke(self, messages):
|
| 215 |
+
print("WARNING: Using dummy model that returns fixed responses")
|
| 216 |
+
class DummyResponse:
|
| 217 |
+
def __init__(self):
|
| 218 |
+
self.content = "I apologize, but I'm unable to process your query right now due to a technical issue. The system administrators have been notified."
|
| 219 |
+
return DummyResponse()
|
| 220 |
+
|
| 221 |
+
return DummyModel()
|
| 222 |
|
| 223 |
@st.cache_resource
|
| 224 |
def get_agent_model():
|
| 225 |
"""Get the more powerful model for agent and evaluation."""
|
| 226 |
+
print("Initializing agent model...")
|
| 227 |
+
# Try multiple approaches to initialize the model
|
| 228 |
try:
|
| 229 |
+
# Approach 1: Direct OpenAI client
|
| 230 |
+
print("Trying direct OpenAI client approach for agent model")
|
| 231 |
+
try:
|
| 232 |
+
# Use direct OpenAI client to avoid proxy issues
|
| 233 |
+
openai_client = OpenAI()
|
| 234 |
+
|
| 235 |
+
# Create a wrapper that mimics LangChain's interface
|
| 236 |
+
class SimpleOpenAIWrapper:
|
| 237 |
+
def invoke(self, messages):
|
| 238 |
+
print("Invoking agent SimpleOpenAIWrapper...")
|
| 239 |
+
# Convert LangChain messages to OpenAI format
|
| 240 |
+
openai_messages = []
|
| 241 |
+
for msg in messages:
|
| 242 |
+
role = "user"
|
| 243 |
+
if hasattr(msg, "type"):
|
| 244 |
+
role = "assistant" if msg.type == "ai" else "user"
|
| 245 |
+
openai_messages.append({
|
| 246 |
+
"role": role,
|
| 247 |
+
"content": msg.content
|
| 248 |
+
})
|
| 249 |
+
|
| 250 |
+
# Log what we're sending to OpenAI
|
| 251 |
+
print(f"Sending {len(openai_messages)} messages to OpenAI API (agent)")
|
| 252 |
+
|
| 253 |
+
# Call API directly with a more powerful model
|
| 254 |
+
response = openai_client.chat.completions.create(
|
| 255 |
+
model="gpt-4.1",
|
| 256 |
+
messages=openai_messages,
|
| 257 |
+
temperature=0
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
class SimpleResponse:
|
| 261 |
+
def __init__(self, content):
|
| 262 |
+
self.content = content
|
| 263 |
+
|
| 264 |
+
result = SimpleResponse(response.choices[0].message.content)
|
| 265 |
+
print(f"Got agent response from OpenAI (length: {len(result.content)})")
|
| 266 |
+
return result
|
| 267 |
+
|
| 268 |
+
print("Successfully created agent SimpleOpenAIWrapper")
|
| 269 |
+
return SimpleOpenAIWrapper()
|
| 270 |
+
except Exception as e:
|
| 271 |
+
print(f"Direct OpenAI client approach for agent failed: {str(e)}")
|
| 272 |
+
import traceback
|
| 273 |
+
traceback.print_exc()
|
| 274 |
+
raise
|
| 275 |
+
|
| 276 |
+
except Exception as outer_e:
|
| 277 |
+
print(f"First agent approach failed: {str(outer_e)}")
|
| 278 |
|
| 279 |
+
# Approach 2: Standard LangChain
|
|
|
|
|
|
|
| 280 |
try:
|
| 281 |
+
print("Trying standard LangChain approach for agent")
|
| 282 |
+
model = ChatOpenAI(model="gpt-4.1", temperature=0)
|
| 283 |
+
print("Successfully created agent ChatOpenAI model")
|
| 284 |
+
return model
|
| 285 |
+
except Exception as e:
|
| 286 |
+
print(f"Standard LangChain approach for agent failed: {str(e)}")
|
| 287 |
+
|
| 288 |
+
# Approach 3: Very minimal LangChain with fallback model
|
| 289 |
try:
|
| 290 |
+
print("Trying minimal LangChain approach for agent")
|
| 291 |
+
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
|
| 292 |
+
print("Successfully created minimal agent ChatOpenAI model")
|
| 293 |
+
return model
|
| 294 |
+
except Exception as e2:
|
| 295 |
+
print(f"Minimal LangChain for agent also failed: {str(e2)}")
|
| 296 |
+
|
| 297 |
+
# Last resort: Dummy implementation
|
| 298 |
+
print("Using dummy agent model as last resort")
|
| 299 |
class DummyModel:
|
| 300 |
def invoke(self, messages):
|
| 301 |
+
print("WARNING: Using dummy agent model that returns fixed responses")
|
| 302 |
class DummyResponse:
|
| 303 |
def __init__(self):
|
| 304 |
+
self.content = "I apologize, but I'm unable to process complex queries right now due to a technical issue."
|
| 305 |
return DummyResponse()
|
| 306 |
|
| 307 |
return DummyModel()
|
|
|
|
| 309 |
@st.cache_resource
|
| 310 |
def get_embedding_model():
|
| 311 |
"""Get the embedding model."""
|
| 312 |
+
print("Initializing embedding model...")
|
| 313 |
try:
|
| 314 |
+
# Approach 1: Direct OpenAI client
|
| 315 |
+
print("Trying direct OpenAI client approach for embeddings")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
try:
|
| 317 |
+
# Create an OpenAI client directly
|
| 318 |
+
openai_client = OpenAI()
|
|
|
|
| 319 |
|
| 320 |
+
# Create a wrapper class that matches the interface LangChain expects
|
| 321 |
+
class SimpleEmbeddings:
|
| 322 |
def embed_query(self, text):
|
| 323 |
+
print(f"Embedding query text (length: {len(text)})")
|
| 324 |
+
try:
|
| 325 |
+
response = openai_client.embeddings.create(
|
| 326 |
+
model="text-embedding-3-small",
|
| 327 |
+
input=text
|
| 328 |
+
)
|
| 329 |
+
print("Successfully got embedding from OpenAI API")
|
| 330 |
+
return response.data[0].embedding
|
| 331 |
+
except Exception as e:
|
| 332 |
+
print(f"Error in embed_query: {str(e)}")
|
| 333 |
+
import traceback
|
| 334 |
+
traceback.print_exc()
|
| 335 |
+
# Return a dummy embedding of the right size
|
| 336 |
+
print("WARNING: Returning dummy embedding vector")
|
| 337 |
+
return [0.0] * 1536 # Standard size for embeddings
|
| 338 |
|
| 339 |
def embed_documents(self, texts):
|
| 340 |
+
print(f"Embedding {len(texts)} documents")
|
| 341 |
+
try:
|
| 342 |
+
if not texts:
|
| 343 |
+
return []
|
| 344 |
+
|
| 345 |
+
# Embed each text individually to avoid batch size issues
|
| 346 |
+
results = []
|
| 347 |
+
for i, text in enumerate(texts):
|
| 348 |
+
print(f"Embedding document {i+1}/{len(texts)}")
|
| 349 |
+
results.append(self.embed_query(text))
|
| 350 |
+
return results
|
| 351 |
+
except Exception as e:
|
| 352 |
+
print(f"Error in embed_documents: {str(e)}")
|
| 353 |
+
import traceback
|
| 354 |
+
traceback.print_exc()
|
| 355 |
+
# Return dummy embeddings
|
| 356 |
+
print("WARNING: Returning dummy document embeddings")
|
| 357 |
+
return [[0.0] * 1536 for _ in range(len(texts))]
|
| 358 |
+
|
| 359 |
+
print("Successfully created SimpleEmbeddings")
|
| 360 |
+
return SimpleEmbeddings()
|
| 361 |
+
except Exception as e:
|
| 362 |
+
print(f"Direct OpenAI client approach for embeddings failed: {str(e)}")
|
| 363 |
+
import traceback
|
| 364 |
+
traceback.print_exc()
|
| 365 |
+
raise
|
| 366 |
+
|
| 367 |
+
except Exception as outer_e:
|
| 368 |
+
print(f"First embedding approach failed: {str(outer_e)}")
|
| 369 |
+
|
| 370 |
+
# Approach 2: Standard LangChain OpenAIEmbeddings
|
| 371 |
+
try:
|
| 372 |
+
print("Trying standard LangChain approach for embeddings")
|
| 373 |
+
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 374 |
+
print("Successfully created OpenAIEmbeddings")
|
| 375 |
+
return embeddings
|
| 376 |
+
except Exception as e:
|
| 377 |
+
print(f"Standard OpenAIEmbeddings failed: {str(e)}")
|
| 378 |
|
| 379 |
+
# Approach 3: Very minimal OpenAIEmbeddings
|
| 380 |
+
try:
|
| 381 |
+
print("Trying minimal OpenAIEmbeddings")
|
| 382 |
+
embeddings = OpenAIEmbeddings()
|
| 383 |
+
print("Successfully created minimal OpenAIEmbeddings")
|
| 384 |
+
return embeddings
|
| 385 |
+
except Exception as e2:
|
| 386 |
+
print(f"Minimal OpenAIEmbeddings failed: {str(e2)}")
|
| 387 |
+
|
| 388 |
+
# Last resort: Dummy implementation
|
| 389 |
+
print("Using dummy embeddings as last resort")
|
| 390 |
+
class DummyEmbeddings:
|
| 391 |
+
def embed_query(self, text):
|
| 392 |
+
print("WARNING: Using dummy embeddings")
|
| 393 |
+
return [0.0] * 1536
|
| 394 |
+
|
| 395 |
+
def embed_documents(self, texts):
|
| 396 |
+
print("WARNING: Using dummy document embeddings")
|
| 397 |
+
return [[0.0] * 1536 for _ in range(len(texts))]
|
| 398 |
+
|
| 399 |
+
return DummyEmbeddings()
|
| 400 |
|
| 401 |
@st.cache_resource
|
| 402 |
def setup_qdrant_client():
|
| 403 |
"""Set up the Qdrant client."""
|
| 404 |
+
print(f"Attempting to setup Qdrant client with path: {QDRANT_DIR}")
|
| 405 |
# Check if Qdrant dir exists
|
| 406 |
if not os.path.exists(QDRANT_DIR):
|
| 407 |
print(f"WARNING: Qdrant directory not found: {QDRANT_DIR}")
|
| 408 |
print(f"Contents of {PROCESSED_DATA_DIR}: {os.listdir(PROCESSED_DATA_DIR) if os.path.exists(PROCESSED_DATA_DIR) else 'Not found'}")
|
| 409 |
|
| 410 |
try:
|
| 411 |
+
print("Trying to create QdrantClient with path parameter")
|
| 412 |
client = QdrantClient(path=str(QDRANT_DIR))
|
| 413 |
+
print("Successfully created Qdrant client with path parameter")
|
| 414 |
|
| 415 |
# Verify client works by getting collections
|
| 416 |
try:
|
| 417 |
collection_name = "kohavi_ab_testing_pdf_collection"
|
| 418 |
+
print(f"Trying to get collections from Qdrant")
|
| 419 |
collections = client.get_collections()
|
| 420 |
+
print(f"Available collections: {collections.collections}")
|
| 421 |
|
| 422 |
# Check if our collection exists
|
| 423 |
collection_exists = False
|
| 424 |
for collection in collections.collections:
|
| 425 |
if collection.name == collection_name:
|
| 426 |
collection_exists = True
|
| 427 |
+
print(f"Found our collection: {collection_name}")
|
| 428 |
break
|
| 429 |
|
| 430 |
if not collection_exists:
|
| 431 |
print(f"WARNING: Collection '{collection_name}' not found!")
|
| 432 |
except Exception as e:
|
| 433 |
print(f"Warning: Could not get collections: {str(e)}")
|
| 434 |
+
import traceback
|
| 435 |
+
traceback.print_exc()
|
| 436 |
|
| 437 |
return client
|
| 438 |
except Exception as e:
|
| 439 |
print(f"Error creating QdrantClient with path: {str(e)}")
|
| 440 |
+
import traceback
|
| 441 |
+
traceback.print_exc()
|
| 442 |
|
| 443 |
# Try alternative parameter
|
| 444 |
try:
|
| 445 |
+
print("Trying to create QdrantClient with location parameter")
|
| 446 |
client = QdrantClient(location=str(QDRANT_DIR))
|
| 447 |
print("Successfully created QdrantClient with location parameter")
|
| 448 |
return client
|
| 449 |
except Exception as e2:
|
| 450 |
print(f"Alternative initialization failed: {str(e2)}")
|
| 451 |
+
|
| 452 |
+
# Try in-memory as last resort (for testing)
|
| 453 |
+
try:
|
| 454 |
+
print("FALLBACK: Creating in-memory QdrantClient")
|
| 455 |
+
client = QdrantClient(":memory:")
|
| 456 |
+
print("Created in-memory QdrantClient as fallback")
|
| 457 |
+
return client
|
| 458 |
+
except Exception as e3:
|
| 459 |
+
print(f"Even in-memory Qdrant failed: {str(e3)}")
|
| 460 |
+
import traceback
|
| 461 |
+
traceback.print_exc()
|
| 462 |
+
raise
|
| 463 |
|
| 464 |
def rag_chain_node(query):
|
| 465 |
"""
|
|
|
|
| 476 |
|
| 477 |
# Get embedding for the query
|
| 478 |
embedding_model = get_embedding_model()
|
| 479 |
+
print("Getting embedding for query...")
|
| 480 |
query_embedding = embedding_model.embed_query(query)
|
| 481 |
+
print(f"Generated embedding of length: {len(query_embedding)}")
|
| 482 |
|
| 483 |
# Get documents
|
| 484 |
+
print("Loading document chunks...")
|
| 485 |
chunks = load_document_chunks()
|
| 486 |
+
print(f"Loaded {len(chunks)} document chunks")
|
| 487 |
|
| 488 |
# Map of document IDs to actual documents
|
| 489 |
docs_by_id = {i: doc for i, doc in enumerate(chunks)}
|
| 490 |
|
| 491 |
# Search for relevant documents
|
| 492 |
+
print(f"Searching collection '{collection_name}' for documents...")
|
| 493 |
+
try:
|
| 494 |
+
# First try using query_points (preferred method)
|
| 495 |
+
print("Trying query_points method first...")
|
| 496 |
+
search_results = client.query_points(
|
| 497 |
+
collection_name=collection_name,
|
| 498 |
+
query_vector=query_embedding,
|
| 499 |
+
limit=5
|
| 500 |
+
)
|
| 501 |
+
print(f"Found {len(search_results)} results using query_points method")
|
| 502 |
+
except Exception as e1:
|
| 503 |
+
print(f"query_points method failed: {str(e1)}")
|
| 504 |
+
|
| 505 |
+
# Fall back to search method
|
| 506 |
+
print("Falling back to search method...")
|
| 507 |
+
try:
|
| 508 |
+
search_results = client.search(
|
| 509 |
+
collection_name=collection_name,
|
| 510 |
+
query_vector=query_embedding,
|
| 511 |
+
limit=5
|
| 512 |
+
)
|
| 513 |
+
print(f"Found {len(search_results)} results using search method")
|
| 514 |
+
except Exception as e2:
|
| 515 |
+
print(f"Both query methods failed: {str(e2)}")
|
| 516 |
+
import traceback
|
| 517 |
+
traceback.print_exc()
|
| 518 |
+
raise
|
| 519 |
|
| 520 |
# Convert search results to documents
|
| 521 |
docs = []
|
| 522 |
+
print("Processing search results...")
|
| 523 |
for result in search_results:
|
| 524 |
doc_id = result.id
|
| 525 |
if doc_id in docs_by_id:
|
| 526 |
docs.append(docs_by_id[doc_id])
|
| 527 |
+
print(f"Added doc with ID {doc_id}")
|
| 528 |
+
else:
|
| 529 |
+
print(f"Warning: Doc ID {doc_id} not found in loaded chunks")
|
| 530 |
except Exception as e:
|
| 531 |
print(f"Error in document retrieval: {str(e)}")
|
| 532 |
+
import traceback
|
| 533 |
+
traceback.print_exc()
|
| 534 |
return "I'm having trouble retrieving relevant information. Please try again later.", []
|
| 535 |
|
| 536 |
# 2. Extract sources from the documents
|
| 537 |
sources = []
|
| 538 |
+
print(f"Extracting sources from {len(docs)} documents...")
|
| 539 |
for doc in docs:
|
| 540 |
source_path = doc.metadata.get("source", "")
|
| 541 |
+
print(f"Processing source: {source_path}")
|
| 542 |
filename = source_path.split("/")[-1] if "/" in source_path else source_path
|
| 543 |
|
| 544 |
# Remove .pdf extension if present
|
|
|
|
| 550 |
"page": doc.metadata.get("page", "unknown"),
|
| 551 |
"type": "pdf"
|
| 552 |
})
|
| 553 |
+
print(f"Added source: {filename}, Page: {doc.metadata.get('page', 'unknown')}")
|
| 554 |
|
| 555 |
# 3. Use the RAG chain to generate an answer
|
| 556 |
if not docs:
|
|
|
|
| 559 |
|
| 560 |
# Create context from documents
|
| 561 |
context = "\n\n".join([doc.page_content for doc in docs])
|
| 562 |
+
print(f"Created context of length: {len(context)}")
|
| 563 |
|
| 564 |
# Format the prompt with context and query
|
| 565 |
formatted_prompt = rag_prompt.format(context=context, question=query)
|
|
|
|
| 567 |
# Send to the model and parse the output
|
| 568 |
print("Generating answer...")
|
| 569 |
chat_model = get_chat_model()
|
| 570 |
+
try:
|
| 571 |
+
response = chat_model.invoke(formatted_prompt)
|
| 572 |
+
response_text = response.content
|
| 573 |
+
print(f"Generated response of length: {len(response_text)}")
|
| 574 |
+
return response_text, sources
|
| 575 |
+
except Exception as e:
|
| 576 |
+
print(f"Error generating response: {str(e)}")
|
| 577 |
+
import traceback
|
| 578 |
+
traceback.print_exc()
|
| 579 |
+
return "I encountered an error while generating a response. Please try again.", sources
|
| 580 |
|
| 581 |
def evaluate_response(query, response):
|
| 582 |
"""
|
verify_data.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# Check various possible locations
|
| 5 |
+
possible_paths = [
|
| 6 |
+
"processed_data",
|
| 7 |
+
"/app/processed_data",
|
| 8 |
+
"../processed_data",
|
| 9 |
+
"./processed_data"
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
for path in possible_paths:
|
| 13 |
+
chunks_file = Path(path) / "document_chunks.pkl"
|
| 14 |
+
qdrant_dir = Path(path) / "qdrant_vectorstore"
|
| 15 |
+
|
| 16 |
+
print(f"Checking path: {path}")
|
| 17 |
+
print(f" Exists?: {os.path.exists(path)}")
|
| 18 |
+
|
| 19 |
+
if os.path.exists(path):
|
| 20 |
+
print(f" Contents: {os.listdir(path)}")
|
| 21 |
+
print(f" Chunks file exists?: {os.path.exists(chunks_file)}")
|
| 22 |
+
print(f" Qdrant dir exists?: {os.path.exists(qdrant_dir)}")
|
| 23 |
+
|
| 24 |
+
if os.path.exists(qdrant_dir):
|
| 25 |
+
print(f" Qdrant contents: {os.listdir(qdrant_dir)}")
|
| 26 |
+
|
| 27 |
+
# Show current working directory and its contents
|
| 28 |
+
print(f"Current directory: {os.getcwd()}")
|
| 29 |
+
print(f"Contents: {os.listdir('.')}")
|