Spaces:

kamkol
/

AB_Testing_RAG_Agent

Sleeping

App Files Community

kamkol committed on Apr 30, 2025

Commit

0be47a9

1 Parent(s): 87e184e

Simplify OpenAIEmbeddings initialization and improve error handling

Browse files

Files changed (1) hide show

streamlit_app.py +110 -101

streamlit_app.py CHANGED Viewed

@@ -114,27 +114,21 @@ def get_embedding_model():
     from langchain_openai import OpenAIEmbeddings
     import os
-    # The simplest initialization possible - just model name
     try:
         return OpenAIEmbeddings(model="text-embedding-3-small")
     except Exception as e:
-        print(f"OpenAIEmbeddings initialization error: {str(e)}")
-        # Try with just API key, no other parameters
         try:
-            return OpenAIEmbeddings(
-                model="text-embedding-3-small",
-                openai_api_key=os.environ.get("OPENAI_API_KEY")
-            )
         except Exception as e2:
-            print(f"Second attempt failed: {str(e2)}")
-            # Last resort - most minimal initialization
-            return OpenAIEmbeddings(
-                model="text-embedding-3-small",
-                openai_api_key=os.environ.get("OPENAI_API_KEY"),
-                client=None  # Let the class create its own client
-            )
 @st.cache_resource
 def setup_qdrant_client():
@@ -160,100 +154,115 @@ def setup_qdrant_client():
 def retrieve_documents(query, k=5):
     """Retrieve relevant documents for a query."""
     # Get models and data
-    embedding_model = get_embedding_model()
-    chunks = load_document_chunks()
-    client = setup_qdrant_client()
-    # Create a mapping of IDs to documents
-    docs_by_id = {i: doc for i, doc in enumerate(chunks)}
-    # Get query embedding
-    query_embedding = embedding_model.embed_query(query)
-    # Search Qdrant
     try:
-        # Try the new API method first
-        results = client.query_points(
-            collection_name="kohavi_ab_testing_pdf_collection",
-            query_vector=query_embedding,
-            limit=k
-        )
-        print("Successfully used query_points method")
-    except Exception as e:
-        print(f"Error with query_points method: {str(e)}")
         try:
-            # Try a different parameter format
             results = client.query_points(
                 collection_name="kohavi_ab_testing_pdf_collection",
                 query_vector=query_embedding,
-                with_payload=True,
-                with_vectors=False,
                 limit=k
             )
-            print("Successfully used query_points with alternate parameters")
-        except Exception as e2:
-            print(f"Error with alternate query_points: {str(e2)}")
-            # Fall back to the deprecated method as last resort
-            results = client.search(
-                collection_name="kohavi_ab_testing_pdf_collection",
-                query_vector=query_embedding,
-                limit=k
-            )
-            print("Using deprecated search method")
-    # Convert results to documents
-    documents = []
-    sources_dict = {}  # Use a dictionary to track unique sources by file+page
-    print(f"Retrieved {len(results)} search results")
-    for result in results:
-        doc_id = result.id
-        if doc_id in docs_by_id:
-            doc = docs_by_id[doc_id]
-            documents.append(doc)
-            # Debug the metadata
-            print(f"Document metadata: {doc.metadata}")
-            # Extract source info
-            source_path = doc.metadata.get("source", "")
-            filename = source_path.split("/")[-1] if "/" in source_path else source_path
-            # Remove .pdf extension if present
-            if filename.lower().endswith('.pdf'):
-                filename = filename[:-4]
-            # Default to the full filename if we can't extract a title
-            if not filename:
-                filename = "Unknown Source"
-            # Get page number, use a default if not available
-            page = doc.metadata.get("page", "unknown")
-            # All PDF sources in data directory are by Ron Kohavi, so add his name as prefix
-            title = f"Ron Kohavi: {filename}"
-            # Create a unique key for this source based on filename and page
-            source_key = f"{filename}_{page}"
-            # Only add to sources if we haven't seen this exact source (same file, same page) before
-            if source_key not in sources_dict:
-                sources_dict[source_key] = {
-                    "title": title,
-                    "page": page,
-                    "score": float(result.score),
-                    "type": "pdf"
-                }
-                print(f"Added source: {title}, Page: {page}")
-            else:
-                print(f"Skipping duplicate source: {title}, Page: {page}")
-    # Convert the dictionary of unique sources back to a list
-    sources = list(sources_dict.values())
-    print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
-    return documents, sources
 def rephrase_query(query):
     """Rephrase the query to improve retrieval."""

     from langchain_openai import OpenAIEmbeddings
     import os
+    # Simplest possible initialization
     try:
+        api_key = os.environ.get("OPENAI_API_KEY", "")
+        print(f"Using API key: {api_key[:4]}...{api_key[-4:] if len(api_key) > 8 else ''}")
+        # Most minimal initialization - one parameter only
         return OpenAIEmbeddings(model="text-embedding-3-small")
     except Exception as e:
+        print(f"Error initializing embeddings: {str(e)}")
+        # Try more minimal approach (in case model param is causing issues)
         try:
+            return OpenAIEmbeddings()
         except Exception as e2:
+            print(f"Final attempt to initialize embeddings failed: {str(e2)}")
+            raise
 @st.cache_resource
 def setup_qdrant_client():
 def retrieve_documents(query, k=5):
     """Retrieve relevant documents for a query."""
     # Get models and data
     try:
+        embedding_model = get_embedding_model()
+        chunks = load_document_chunks()
+        client = setup_qdrant_client()
+        # Create a mapping of IDs to documents
+        docs_by_id = {i: doc for i, doc in enumerate(chunks)}
+        # Get query embedding
+        query_embedding = embedding_model.embed_query(query)
+        # Try various search methods until one works
+        results = None
         try:
+            # Try simplest query_points call
             results = client.query_points(
                 collection_name="kohavi_ab_testing_pdf_collection",
                 query_vector=query_embedding,
                 limit=k
             )
+            print("Successfully used query_points method")
+        except Exception as e:
+            print(f"First query attempt failed: {str(e)}")
+            try:
+                # Try with explicit parameters
+                results = client.query_points(
+                    collection_name="kohavi_ab_testing_pdf_collection",
+                    query_vector=query_embedding,
+                    with_payload=True,
+                    limit=k
+                )
+                print("Successfully used query_points with explicit parameters")
+            except Exception as e2:
+                print(f"Second query attempt failed: {str(e2)}")
+                try:
+                    # Fall back to deprecated search method
+                    results = client.search(
+                        collection_name="kohavi_ab_testing_pdf_collection",
+                        query_vector=query_embedding,
+                        limit=k
+                    )
+                    print("Successfully used deprecated search method")
+                except Exception as e3:
+                    print(f"All query methods failed: {str(e3)}")
+                    # No results found - return empty list
+                    return [], []
+        # If we got here but results is still None, return empty lists
+        if results is None:
+            print("No results found with any query method")
+            return [], []
+        # Convert results to documents
+        documents = []
+        sources_dict = {}  # Use a dictionary to track unique sources by file+page
+        print(f"Retrieved {len(results)} search results")
+        for result in results:
+            doc_id = result.id
+            if doc_id in docs_by_id:
+                doc = docs_by_id[doc_id]
+                documents.append(doc)
+                # Debug the metadata
+                print(f"Document metadata: {doc.metadata}")
+                # Extract source info
+                source_path = doc.metadata.get("source", "")
+                filename = source_path.split("/")[-1] if "/" in source_path else source_path
+                # Remove .pdf extension if present
+                if filename.lower().endswith('.pdf'):
+                    filename = filename[:-4]
+                # Default to the full filename if we can't extract a title
+                if not filename:
+                    filename = "Unknown Source"
+                # Get page number, use a default if not available
+                page = doc.metadata.get("page", "unknown")
+                # All PDF sources in data directory are by Ron Kohavi, so add his name as prefix
+                title = f"Ron Kohavi: {filename}"
+                # Create a unique key for this source based on filename and page
+                source_key = f"{filename}_{page}"
+                # Only add to sources if we haven't seen this exact source (same file, same page) before
+                if source_key not in sources_dict:
+                    sources_dict[source_key] = {
+                        "title": title,
+                        "page": page,
+                        "score": float(result.score),
+                        "type": "pdf"
+                    }
+                    print(f"Added source: {title}, Page: {page}")
+                else:
+                    print(f"Skipping duplicate source: {title}, Page: {page}")
+        # Convert the dictionary of unique sources back to a list
+        sources = list(sources_dict.values())
+        print(f"Returning {len(documents)} documents with {len(sources)} unique sources")
+        return documents, sources
+    except Exception as e:
+        print(f"Error in retrieve_documents: {str(e)}")
+        # Return empty results in case of any error
+        return [], []
 def rephrase_query(query):
     """Rephrase the query to improve retrieval."""