Spaces:

vichudo
/

agentic-defensor

Sleeping

App Files Files Community

vichudo committed on Mar 18, 2025

Commit

254ca68

1 Parent(s): 0560e17

fix

Browse files

Files changed (4) hide show

download_from_hub.py +363 -60
requirements.txt +5 -1
src/embeddings/embedder.py +47 -2
src/models/retriever.py +43 -0

download_from_hub.py CHANGED Viewed

@@ -4,6 +4,8 @@ import pickle
 import sys
 import numpy as np
 from huggingface_hub import hf_hub_download, list_repo_files
 def ensure_dirs():
     """Create necessary directories if they don't exist."""
@@ -11,6 +13,64 @@ def ensure_dirs():
     os.makedirs("embeddings", exist_ok=True)
     os.makedirs("pdfs", exist_ok=True)
 def create_fallback_data():
     """Create minimal fallback data if downloads fail."""
     print("Creating fallback data files...")
@@ -18,20 +78,22 @@ def create_fallback_data():
     # Create minimal embeddings
     try:
         print("Creating fallback embeddings...")
-        # Create a small random matrix as embeddings (10 documents, 384 dimensions)
-        embeddings = np.random.random((10, 384)).astype(np.float32)
         with open("embeddings/embeddings.pkl", "wb") as f:
             pickle.dump(embeddings, f)
-        # Create a minimal FAISS index
         import faiss
-        dimension = 384
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings)
         faiss.write_index(index, "embeddings/faiss_index.index")
         print("Fallback embeddings and FAISS index created successfully!")
     except Exception as e:
         print(f"Error creating fallback embeddings: {e}")
         return False
     # Create minimal document chunks
@@ -51,47 +113,27 @@ def create_fallback_data():
         print("Fallback document chunks created successfully!")
     except Exception as e:
         print(f"Error creating fallback document chunks: {e}")
         return False
-    return True
 def download_datasets():
     """Download datasets from Hugging Face Hub."""
     print("Downloading data files from Hugging Face Hub...")
     download_success = True
-    # Download embeddings
-    try:
-        from datasets import load_dataset
-        print("Downloading embeddings...")
-        # First check what files are available in the dataset repository
-        try:
-            files = list_repo_files("vichudo/agentic-defensor-embeddings", repo_type="dataset")
-            print(f"Files in embeddings repository: {files}")
-        except Exception as e:
-            print(f"Error listing files in embeddings repository: {e}")
-        embeddings_ds = load_dataset("vichudo/agentic-defensor-embeddings", split="train")
-        print(f"Embeddings dataset info: {embeddings_ds}")
-        print(f"Embeddings dataset features: {embeddings_ds.features}")
-        print(f"First row of embeddings dataset: {embeddings_ds[0]}")
-        if "data" not in embeddings_ds[0]:
-            print("Error: No 'data' field found in embeddings dataset")
-            print(f"Available fields: {embeddings_ds[0].keys()}")
-            download_success = False
-        else:
-            embeddings_data = pickle.loads(embeddings_ds[0]["data"])
-            with open("embeddings/embeddings.pkl", "wb") as f:
-                pickle.dump(embeddings_data, f)
-            print("Embeddings downloaded and saved successfully!")
-    except Exception as e:
-        print(f"Error downloading embeddings: {e}")
-        download_success = False
-    # Download FAISS index
     try:
-        print("Downloading FAISS index...")
         # Try direct file download
         try:
             faiss_path = hf_hub_download(
@@ -99,9 +141,15 @@ def download_datasets():
                 filename="faiss_index.index",
                 repo_type="dataset"
             )
-            # Copy to correct location
-            os.system(f"cp {faiss_path} embeddings/faiss_index.index")
-            print("FAISS index downloaded and saved successfully!")
         except Exception as e:
             print(f"Direct download of FAISS index failed: {e}")
             # Try alternate approach using dataset API
@@ -113,44 +161,278 @@ def download_datasets():
                     import faiss
                     faiss.write_index(embeddings_ds.faiss_index, "embeddings/faiss_index.index")
                     print("FAISS index from dataset attributes saved successfully!")
                 else:
-                    raise ValueError("No FAISS index found in dataset attributes")
             except Exception as inner_e:
                 print(f"Alternative FAISS index download failed: {inner_e}")
-                raise
     except Exception as e:
         print(f"Error downloading FAISS index: {e}")
         download_success = False
     # Download document chunks
     try:
-        from datasets import load_dataset
-        print("Downloading document chunks...")
         # First check what files are available
         try:
             files = list_repo_files("vichudo/agentic-defensor-chunks", repo_type="dataset")
             print(f"Files in chunks repository: {files}")
         except Exception as e:
             print(f"Error listing files in chunks repository: {e}")
-        chunks_ds = load_dataset("vichudo/agentic-defensor-chunks", split="train")
-        print(f"Chunks dataset info: {chunks_ds}")
-        print(f"Chunks dataset features: {chunks_ds.features}")
-        print(f"First row of chunks dataset: {chunks_ds[0]}")
-        if "data" not in chunks_ds[0]:
-            print("Error: No 'data' field found in chunks dataset")
-            print(f"Available fields: {chunks_ds[0].keys()}")
-            download_success = False
-        else:
-            chunks_data = pickle.loads(chunks_ds[0]["data"])
-            with open("data/doc_chunks.pkl", "wb") as f:
-                pickle.dump(chunks_data, f)
-            print("Document chunks downloaded and saved successfully!")
     except Exception as e:
         print(f"Error downloading document chunks: {e}")
         download_success = False
     return download_success
 if __name__ == "__main__":
@@ -159,12 +441,33 @@ if __name__ == "__main__":
     # If download fails, create fallback data
     if not success:
-        print("Downloads failed. Creating fallback data...")
         success = create_fallback_data()
     if success:
-        print("Data files setup completed successfully!")
-        sys.exit(0)
     else:
-        print("Failed to set up data files.")
         sys.exit(1)

 import sys
 import numpy as np
 from huggingface_hub import hf_hub_download, list_repo_files
+import traceback
+import shutil
 def ensure_dirs():
     """Create necessary directories if they don't exist."""
     os.makedirs("embeddings", exist_ok=True)
     os.makedirs("pdfs", exist_ok=True)
+def verify_embeddings_faiss_compatibility():
+    """Verify that the downloaded embeddings and FAISS index are compatible.
+
+    Reads ``embeddings/embeddings.pkl`` and ``embeddings/faiss_index.index``
+    relative to the current working directory, prints diagnostics, and runs
+    a one-vector test search against the index.
+
+    Returns:
+        True if both files exist, no shape/dimension check fails, and the
+        test search succeeds; False otherwise. Any ``Exception`` raised
+        along the way is caught, printed with its traceback, and reported
+        as False.
+    """
+    print("Verifying compatibility between embeddings and FAISS index...")
+    try:
+        # Check if files exist
+        if not os.path.exists("embeddings/embeddings.pkl"):
+            print("Error: embeddings.pkl does not exist")
+            return False
+        if not os.path.exists("embeddings/faiss_index.index"):
+            print("Error: faiss_index.index does not exist")
+            return False
+        # Load embeddings
+        # NOTE(review): pickle.load executes code embedded in the file; this
+        # path may hold a file copied from a Hub download - confirm the
+        # source repository is trusted.
+        with open("embeddings/embeddings.pkl", "rb") as f:
+            embeddings = pickle.load(f)
+        print(f"Loaded embeddings with shape: {embeddings.shape if hasattr(embeddings, 'shape') else 'Unknown'}")
+        # Load FAISS index and check compatibility
+        import faiss
+        index = faiss.read_index("embeddings/faiss_index.index")
+        # Print FAISS index stats
+        print(f"FAISS index contains {index.ntotal} vectors of dimension {index.d}")
+        # Check if the dimensionality matches
+        # Shape checks only run for objects exposing a `shape` attribute;
+        # anything else (e.g. a plain list) skips them and is judged by the
+        # test query alone.
+        if hasattr(embeddings, 'shape'):
+            if len(embeddings.shape) != 2:
+                print(f"Warning: embeddings should be a 2D array, got shape {embeddings.shape}")
+                return False
+            if embeddings.shape[1] != index.d:
+                print(f"Error: Dimension mismatch - embeddings: {embeddings.shape[1]}, FAISS index: {index.d}")
+                return False
+            # Check if number of vectors matches
+            # A count mismatch is only reported; it does not fail verification.
+            if embeddings.shape[0] != index.ntotal:
+                print(f"Warning: Count mismatch - embeddings: {embeddings.shape[0]}, FAISS index: {index.ntotal}")
+                print("This might be acceptable if the index was created from a subset of embeddings")
+        # Test a simple query to ensure the index works
+        # The all-zero query only proves that search() runs; the returned
+        # distances (D) and indices (I) are not inspected.
+        try:
+            test_query = np.zeros((1, index.d), dtype=np.float32)
+            D, I = index.search(test_query, 1)
+            print("FAISS index test query successful")
+            return True
+        except Exception as e:
+            print(f"FAISS index test query failed: {e}")
+            traceback.print_exc()
+            return False
+    except Exception as e:
+        print(f"Compatibility verification failed: {e}")
+        traceback.print_exc()
+        return False
 def create_fallback_data():
     """Create minimal fallback data if downloads fail."""
     print("Creating fallback data files...")
     # Create minimal embeddings
     try:
         print("Creating fallback embeddings...")
+        # Create a small random matrix as embeddings (10 documents, 1536 dimensions - OpenAI dimension)
+        dimension = 1536  # text-embedding-3-small dimension
+        embeddings = np.random.random((10, dimension)).astype(np.float32)
         with open("embeddings/embeddings.pkl", "wb") as f:
             pickle.dump(embeddings, f)
+        # Create a minimal FAISS index with same dimension
         import faiss
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings)
         faiss.write_index(index, "embeddings/faiss_index.index")
         print("Fallback embeddings and FAISS index created successfully!")
     except Exception as e:
         print(f"Error creating fallback embeddings: {e}")
+        traceback.print_exc()
         return False
     # Create minimal document chunks
         print("Fallback document chunks created successfully!")
     except Exception as e:
         print(f"Error creating fallback document chunks: {e}")
+        traceback.print_exc()
         return False
+    # Verify compatibility
+    return verify_embeddings_faiss_compatibility()
 def download_datasets():
     """Download datasets from Hugging Face Hub."""
     print("Downloading data files from Hugging Face Hub...")
     download_success = True
+    # Track what we've downloaded
+    faiss_downloaded = False
+    embeddings_downloaded = False
+    chunks_downloaded = False
+    # Try multiple download methods
+    # Download FAISS index first
     try:
+        print("\nDownloading FAISS index...")
         # Try direct file download
         try:
             faiss_path = hf_hub_download(
                 filename="faiss_index.index",
                 repo_type="dataset"
             )
+            # Copy to correct location with error handling
+            if os.path.exists(faiss_path):
+                shutil.copy(faiss_path, "embeddings/faiss_index.index")
+                print(f"FAISS index downloaded and saved successfully to embeddings/faiss_index.index!")
+                print(f"FAISS index size: {os.path.getsize('embeddings/faiss_index.index') / (1024*1024):.2f} MB")
+                faiss_downloaded = True
+            else:
+                print(f"Downloaded FAISS path {faiss_path} does not exist")
         except Exception as e:
             print(f"Direct download of FAISS index failed: {e}")
             # Try alternate approach using dataset API
                     import faiss
                     faiss.write_index(embeddings_ds.faiss_index, "embeddings/faiss_index.index")
                     print("FAISS index from dataset attributes saved successfully!")
+                    faiss_downloaded = True
                 else:
+                    print("No FAISS index found in dataset attributes")
             except Exception as inner_e:
                 print(f"Alternative FAISS index download failed: {inner_e}")
     except Exception as e:
         print(f"Error downloading FAISS index: {e}")
+        traceback.print_exc()
+        download_success = False
+    # Download embeddings
+    try:
+        print("\nDownloading embeddings...")
+        # First check what files are available in the dataset repository
+        try:
+            files = list_repo_files("vichudo/agentic-defensor-embeddings", repo_type="dataset")
+            print(f"Files in embeddings repository: {files}")
+            # Try downloading directly if .pkl file is found
+            for file in files:
+                if file.endswith("embeddings.pkl") or file.endswith("embeddings.pt") or file.endswith("embeddings.npy"):
+                    print(f"Found embeddings file: {file}")
+                    try:
+                        emb_path = hf_hub_download(
+                            repo_id="vichudo/agentic-defensor-embeddings",
+                            filename=file,
+                            repo_type="dataset"
+                        )
+                        # Copy to correct location
+                        shutil.copy(emb_path, "embeddings/embeddings.pkl")
+                        print(f"Embeddings downloaded directly from file {file} and saved successfully!")
+                        embeddings_downloaded = True
+                        break
+                    except Exception as file_e:
+                        print(f"Direct embeddings file download failed: {file_e}")
+        except Exception as e:
+            print(f"Error listing files in embeddings repository: {e}")
+        # If direct file download failed, try using the dataset API
+        if not embeddings_downloaded:
+            try:
+                from datasets import load_dataset
+                import pandas as pd
+                # Try to download the dataset
+                embeddings_ds = load_dataset("vichudo/agentic-defensor-embeddings", split="train")
+                print(f"Embeddings dataset info: {embeddings_ds}")
+                print(f"Embeddings dataset features: {embeddings_ds.features}")
+                # Check first row to understand structure
+                if len(embeddings_ds) > 0:
+                    print(f"First row keys: {embeddings_ds[0].keys()}")
+                # Approach 1: Try to find data blob
+                if "data" in embeddings_ds[0]:
+                    print("Found 'data' blob in dataset")
+                    embeddings_data = pickle.loads(embeddings_ds[0]["data"])
+                    with open("embeddings/embeddings.pkl", "wb") as f:
+                        pickle.dump(embeddings_data, f)
+                    print("Embeddings from data blob saved successfully!")
+                    embeddings_downloaded = True
+                # Approach 2: Try to find embedding column
+                elif "embedding" in embeddings_ds[0]:
+                    print("Found 'embedding' column in dataset")
+                    # Convert dataset to pandas to handle embedding extraction
+                    df = pd.DataFrame(embeddings_ds)
+                    embeddings_array = np.stack(df.embedding.values)
+                    with open("embeddings/embeddings.pkl", "wb") as f:
+                        pickle.dump(embeddings_array, f)
+                    print("Embeddings from column data saved successfully!")
+                    embeddings_downloaded = True
+                # Approach 3: Try to work with parquet files directly
+                else:
+                    try:
+                        print("Trying to work with parquet files directly")
+                        import pyarrow.parquet as pq
+                        # Find all parquet files in the repository
+                        parquet_files = [f for f in files if f.endswith('.parquet')]
+                        if parquet_files:
+                            print(f"Found parquet files: {parquet_files}")
+                            for parquet_file in parquet_files:
+                                try:
+                                    parquet_path = hf_hub_download(
+                                        repo_id="vichudo/agentic-defensor-embeddings",
+                                        filename=parquet_file,
+                                        repo_type="dataset"
+                                    )
+                                    # Try to read parquet and extract embeddings
+                                    table = pq.read_table(parquet_path)
+                                    df = table.to_pandas()
+                                    print(f"Parquet columns: {df.columns}")
+                                    if "embedding" in df.columns:
+                                        print("Found 'embedding' column in parquet file")
+                                        embeddings_array = np.stack(df.embedding.values)
+                                        with open("embeddings/embeddings.pkl", "wb") as f:
+                                            pickle.dump(embeddings_array, f)
+                                        print("Embeddings from parquet file saved successfully!")
+                                        embeddings_downloaded = True
+                                        break
+                                    elif "data" in df.columns:
+                                        print("Found 'data' column in parquet file")
+                                        embeddings_data = pickle.loads(df.data.iloc[0])
+                                        with open("embeddings/embeddings.pkl", "wb") as f:
+                                            pickle.dump(embeddings_data, f)
+                                        print("Embeddings data from parquet file saved successfully!")
+                                        embeddings_downloaded = True
+                                        break
+                                except Exception as parquet_e:
+                                    print(f"Error processing parquet file {parquet_file}: {parquet_e}")
+                    except Exception as parquet_approach_e:
+                        print(f"Error in parquet approach: {parquet_approach_e}")
+            except Exception as ds_e:
+                print(f"Error processing embeddings dataset: {ds_e}")
+                traceback.print_exc()
+    except Exception as e:
+        print(f"Error downloading embeddings: {e}")
+        traceback.print_exc()
         download_success = False
     # Download document chunks
     try:
+        print("\nDownloading document chunks...")
         # First check what files are available
         try:
             files = list_repo_files("vichudo/agentic-defensor-chunks", repo_type="dataset")
             print(f"Files in chunks repository: {files}")
+            # Try direct file download if .pkl file exists
+            for file in files:
+                if file.endswith("doc_chunks.pkl") or file.endswith("chunks.pkl"):
+                    print(f"Found chunks file: {file}")
+                    try:
+                        chunks_path = hf_hub_download(
+                            repo_id="vichudo/agentic-defensor-chunks",
+                            filename=file,
+                            repo_type="dataset"
+                        )
+                        # Copy to correct location
+                        shutil.copy(chunks_path, "data/doc_chunks.pkl")
+                        print(f"Document chunks downloaded directly from file {file} and saved successfully!")
+                        chunks_downloaded = True
+                        break
+                    except Exception as file_e:
+                        print(f"Direct chunks file download failed: {file_e}")
         except Exception as e:
             print(f"Error listing files in chunks repository: {e}")
+        # If direct file approach failed, try dataset API
+        if not chunks_downloaded:
+            try:
+                from datasets import load_dataset
+                import pandas as pd
+                chunks_ds = load_dataset("vichudo/agentic-defensor-chunks", split="train")
+                print(f"Chunks dataset info: {chunks_ds}")
+                print(f"Chunks dataset features: {chunks_ds.features}")
+                if len(chunks_ds) > 0:
+                    print(f"First row keys: {chunks_ds[0].keys()}")
+                # Approach 1: Try to find data blob
+                if "data" in chunks_ds[0]:
+                    print("Found 'data' blob in chunks dataset")
+                    chunks_data = pickle.loads(chunks_ds[0]["data"])
+                    with open("data/doc_chunks.pkl", "wb") as f:
+                        pickle.dump(chunks_data, f)
+                    print("Document chunks from data blob saved successfully!")
+                    chunks_downloaded = True
+                # Approach 2: Try to reconstruct from text columns
+                elif all(field in chunks_ds[0] for field in ["text", "source"]):
+                    print("Found text and source columns, reconstructing chunks")
+                    df = pd.DataFrame(chunks_ds)
+                    chunks_list = []
+                    for _, row in df.iterrows():
+                        chunk = {
+                            "text": row["text"],
+                            "source": row["source"]
+                        }
+                        # Add other fields if available
+                        for field in ["page", "chunk_id", "metadata"]:
+                            if field in row:
+                                chunk[field] = row[field]
+                        chunks_list.append(chunk)
+                    with open("data/doc_chunks.pkl", "wb") as f:
+                        pickle.dump(chunks_list, f)
+                    print(f"Reconstructed {len(chunks_list)} document chunks successfully!")
+                    chunks_downloaded = True
+                # Approach 3: Try to work with parquet files directly
+                else:
+                    try:
+                        print("Trying to work with parquet files directly for chunks")
+                        import pyarrow.parquet as pq
+                        # Find all parquet files in the repository
+                        parquet_files = [f for f in files if f.endswith('.parquet')]
+                        if parquet_files:
+                            print(f"Found parquet files: {parquet_files}")
+                            for parquet_file in parquet_files:
+                                try:
+                                    parquet_path = hf_hub_download(
+                                        repo_id="vichudo/agentic-defensor-chunks",
+                                        filename=parquet_file,
+                                        repo_type="dataset"
+                                    )
+                                    # Try to read parquet and extract chunks
+                                    table = pq.read_table(parquet_path)
+                                    df = table.to_pandas()
+                                    print(f"Parquet columns: {df.columns}")
+                                    if "data" in df.columns:
+                                        print("Found 'data' column in chunks parquet file")
+                                        chunks_data = pickle.loads(df.data.iloc[0])
+                                        with open("data/doc_chunks.pkl", "wb") as f:
+                                            pickle.dump(chunks_data, f)
+                                        print("Chunks data from parquet file saved successfully!")
+                                        chunks_downloaded = True
+                                        break
+                                    elif all(field in df.columns for field in ["text", "source"]):
+                                        print("Found text and source columns in parquet, reconstructing")
+                                        chunks_list = []
+                                        for _, row in df.iterrows():
+                                            chunk = {
+                                                "text": row["text"],
+                                                "source": row["source"]
+                                            }
+                                            # Add other fields if available
+                                            for field in ["page", "chunk_id", "metadata"]:
+                                                if field in row:
+                                                    chunk[field] = row[field]
+                                            chunks_list.append(chunk)
+                                        with open("data/doc_chunks.pkl", "wb") as f:
+                                            pickle.dump(chunks_list, f)
+                                        print(f"Reconstructed {len(chunks_list)} document chunks from parquet successfully!")
+                                        chunks_downloaded = True
+                                        break
+                                except Exception as parquet_e:
+                                    print(f"Error processing chunks parquet file {parquet_file}: {parquet_e}")
+                    except Exception as parquet_approach_e:
+                        print(f"Error in chunks parquet approach: {parquet_approach_e}")
+            except Exception as ds_e:
+                print(f"Error processing chunks dataset: {ds_e}")
+                traceback.print_exc()
     except Exception as e:
         print(f"Error downloading document chunks: {e}")
+        traceback.print_exc()
         download_success = False
+    # Check what was successfully downloaded
+    print("\nDownload summary:")
+    print(f"- FAISS index: {'✓' if faiss_downloaded else '✗'}")
+    print(f"- Embeddings: {'✓' if embeddings_downloaded else '✗'}")
+    print(f"- Document chunks: {'✓' if chunks_downloaded else '✗'}")
+    download_success = faiss_downloaded and embeddings_downloaded and chunks_downloaded
+    # If downloads were successful, verify compatibility
+    if download_success:
+        compatible = verify_embeddings_faiss_compatibility()
+        if not compatible:
+            print("Warning: Downloaded files are not compatible, will use fallback data")
+            download_success = False
     return download_success
 if __name__ == "__main__":
     # If download fails, create fallback data
     if not success:
+        print("\n\nDownloads failed or data is incompatible. Creating fallback data...")
         success = create_fallback_data()
     if success:
+        # Just to be extra sure, load everything to verify
+        try:
+            import faiss
+            index = faiss.read_index("embeddings/faiss_index.index")
+            with open("embeddings/embeddings.pkl", "rb") as f:
+                embeddings = pickle.load(f)
+            with open("data/doc_chunks.pkl", "rb") as f:
+                chunks = pickle.load(f)
+            print("\nFinal verification:")
+            print(f"FAISS index: {index.ntotal} vectors of dimension {index.d}")
+            if hasattr(embeddings, 'shape'):
+                print(f"Embeddings: shape {embeddings.shape}")
+            else:
+                print(f"Embeddings: type {type(embeddings)}")
+            print(f"Document chunks: {len(chunks)} chunks")
+            print("\nData files setup completed successfully!")
+            sys.exit(0)
+        except Exception as e:
+            print(f"\nFinal verification failed: {e}")
+            traceback.print_exc()
+            sys.exit(1)
     else:
+        print("\nFailed to set up data files.")
         sys.exit(1)

requirements.txt CHANGED Viewed

@@ -9,4 +9,8 @@ numpy>=1.24.0
 scikit-learn>=1.3.0
 pandas>=2.0.0
 torch>=2.0.0
-langchain>=0.0.335

 scikit-learn>=1.3.0
 pandas>=2.0.0
 torch>=2.0.0
+langchain>=0.0.335
+pyarrow>=14.0.1
+datasets>=2.15.0
+huggingface_hub>=0.19.0
+requests>=2.31.0

src/embeddings/embedder.py CHANGED Viewed

@@ -20,7 +20,31 @@ class TextEmbedder:
         self.model = model
         self.batch_size = batch_size
         self.client = OpenAI(api_key=OPENAI_API_KEY)
-        self.embedding_dim = 1536  # Default dimension for text-embedding-3-small
     def get_embedding_for_text(self, text: str) -> List[float]:
         """Generate embedding for a single text."""
@@ -80,7 +104,28 @@ class TextEmbedder:
                 input=[query],
                 model=self.model
             )
-            return np.array(q_response.data[0].embedding, dtype='float32').reshape(1, -1)
         except Exception as e:
             print(f"Error creating embedding for query: {e}")
             return np.zeros((1, self.embedding_dim), dtype='float32')

         self.model = model
         self.batch_size = batch_size
         self.client = OpenAI(api_key=OPENAI_API_KEY)
+        # Default dimension for different models
+        self.embedding_dim = self._get_model_dimension(model)
+        print(f"Initialized TextEmbedder with model {model}, dimension {self.embedding_dim}")
+    def _get_model_dimension(self, model_name: str) -> int:
+        """Get the embedding dimension for a given model.
+
+        Args:
+            model_name: Model identifier to look up.
+
+        Returns:
+            The dimension listed for ``model_name`` in the table below, or
+            1536 when the name is not in the table.
+        """
+        # Mapping of model names to dimensions
+        dimensions = {
+            "text-embedding-3-small": 1536,
+            "text-embedding-3-large": 3072,
+            "text-embedding-ada-002": 1536,
+            # Add other models if needed
+        }
+        # Return the dimension for the model or default to 1536 (most common)
+        # Unknown names fall back silently; no warning is printed here.
+        return dimensions.get(model_name, 1536)
+    def set_dimension(self, dimension: int) -> None:
+        """
+        Set the embedding dimension explicitly.
+        Use this to ensure compatibility with existing FAISS indices.
+
+        Overwrites ``self.embedding_dim`` with ``dimension`` and prints the
+        new value. No validation is performed on ``dimension``.
+
+        Args:
+            dimension: The embedding dimension to store.
+        """
+        self.embedding_dim = dimension
+        print(f"Explicitly set embedding dimension to {dimension}")
     def get_embedding_for_text(self, text: str) -> List[float]:
         """Generate embedding for a single text."""
                 input=[query],
                 model=self.model
             )
+            embedding = np.array(q_response.data[0].embedding, dtype='float32')
+            # Check and log the actual dimension
+            actual_dim = embedding.shape[0]
+            if actual_dim != self.embedding_dim:
+                print(f"Warning: OpenAI returned embedding of dimension {actual_dim}, expected {self.embedding_dim}")
+                # Handle dimension mismatch
+                if actual_dim > self.embedding_dim:
+                    # Truncate the embedding to match expected dimension
+                    print(f"Truncating embedding from {actual_dim} to {self.embedding_dim}")
+                    embedding = embedding[:self.embedding_dim]
+                elif actual_dim < self.embedding_dim:
+                    # Pad the embedding to match expected dimension
+                    print(f"Padding embedding from {actual_dim} to {self.embedding_dim}")
+                    padding = np.zeros(self.embedding_dim - actual_dim, dtype='float32')
+                    embedding = np.concatenate([embedding, padding])
+            # Return the embedding as a 2D array
+            return embedding.reshape(1, -1)
         except Exception as e:
             print(f"Error creating embedding for query: {e}")
+            import traceback
+            traceback.print_exc()
             return np.zeros((1, self.embedding_dim), dtype='float32')

src/models/retriever.py CHANGED Viewed

@@ -89,10 +89,27 @@ class Retriever:
             resource_manager.faiss_index = self.index
             resource_manager.doc_chunks = self.doc_chunks
             resource_manager.initialized = True
         except Exception as e:
             print(f"Error loading resources: {e}")
             raise
     def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
         """
         Retrieve the most relevant document chunks for a query.
@@ -117,9 +134,35 @@ class Retriever:
         # Search the FAISS index
         try:
             distances, indices = self.index.search(query_embedding, top_k)
         except Exception as e:
             print(f"Error during FAISS search: {e}")
             # Return all available chunks as fallback
             return self._get_all_chunks_with_placeholder_scores()

             resource_manager.faiss_index = self.index
             resource_manager.doc_chunks = self.doc_chunks
             resource_manager.initialized = True
+            # Ensure embedder dimension matches FAISS index
+            self._ensure_embedder_compatibility()
         except Exception as e:
             print(f"Error loading resources: {e}")
+            import traceback
+            traceback.print_exc()
             raise
+    def _ensure_embedder_compatibility(self) -> None:
+        """Ensure the embedder's dimension matches the FAISS index dimension.
+
+        Does nothing when ``self.index`` is None or the embedder has no
+        ``set_dimension`` method. Otherwise compares ``self.index.d`` with
+        ``self.embedder.embedding_dim`` and, on a mismatch, prints a warning
+        and calls ``self.embedder.set_dimension`` with the index dimension.
+        """
+        if self.index is not None and hasattr(self.embedder, 'set_dimension'):
+            faiss_dim = self.index.d
+            embedder_dim = self.embedder.embedding_dim
+            if faiss_dim != embedder_dim:
+                print(f"Warning: Dimension mismatch between FAISS index ({faiss_dim}) and embedder ({embedder_dim})")
+                print(f"Adjusting embedder dimension to match FAISS index")
+                # The index dimension wins: the embedder is adjusted, never the index.
+                self.embedder.set_dimension(faiss_dim)
     def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
         """
         Retrieve the most relevant document chunks for a query.
         # Search the FAISS index
         try:
+            print(f"FAISS index info - ntotal: {self.index.ntotal}, dimension: {self.index.d}")
+            print(f"Query embedding shape: {query_embedding.shape}")
             distances, indices = self.index.search(query_embedding, top_k)
+            # Log first few results for debugging
+            top_indices = indices[0][:min(3, len(indices[0]))]
+            top_distances = distances[0][:min(3, len(distances[0]))]
+            print(f"Top 3 results - indices: {top_indices}, distances: {top_distances}")
         except Exception as e:
             print(f"Error during FAISS search: {e}")
+            import traceback
+            traceback.print_exc()
+            # Provide diagnostic information
+            try:
+                # Check if embeddings and index are compatible
+                if self.index is None:
+                    print("FAISS index is None - index was not loaded properly")
+                else:
+                    print(f"FAISS index dimension: {self.index.d}, total vectors: {self.index.ntotal}")
+                if query_embedding is None:
+                    print("Query embedding is None")
+                else:
+                    print(f"Query embedding shape: {query_embedding.shape}, dtype: {query_embedding.dtype}")
+                    if query_embedding.shape[1] != self.index.d:
+                        print(f"Dimension mismatch: query embedding ({query_embedding.shape[1]}) vs. FAISS index ({self.index.d})")
+            except Exception as diagnostic_e:
+                print(f"Error during diagnostics: {diagnostic_e}")
             # Return all available chunks as fallback
             return self._get_all_chunks_with_placeholder_scores()