Spaces:

Asish22
/

code-crawler

Sleeping

App Files Files Community

juliaturc committed on Sep 18, 2024

Commit

c3e6715

1 Parent(s): 82df3b5

Add option for Pinecone + BM25 hybrid retrieval. (#36)

Browse files

Files changed (8) hide show

README.md +1 -1
sage/chat.py +9 -9
sage/chunker.py +3 -1
sage/constants.py +3 -0
sage/embedder.py +4 -3
sage/github.py +2 -1
sage/index.py +7 -0
sage/vector_store.py +65 -15

README.md CHANGED Viewed

@@ -80,7 +80,7 @@ pip install git+https://github.com/Storia-AI/sage.git@main
     export PINECONE_API_KEY=...
     ```
-2. Create a Pinecone index [on their website](https://pinecone.io) and export the name:
     ```
     export PINECONE_INDEX_NAME=...
     ```

     export PINECONE_API_KEY=...
     ```
+2. Create a Pinecone account. Export the desired index name (if it doesn't exist yet, we'll create it):
     ```
     export PINECONE_INDEX_NAME=...
     ```

sage/chat.py CHANGED Viewed

@@ -28,7 +28,8 @@ def build_rag_chain(args):
     """Builds a RAG chain via LangChain."""
     llm = build_llm_via_langchain(args.llm_provider, args.llm_model)
-    retriever = vector_store.build_from_args(args).to_langchain().as_retriever(search_kwargs={"k": 25})
     if args.reranker_provider == "none":
         compressor = None
@@ -78,14 +79,6 @@ def build_rag_chain(args):
     return rag_chain
-def append_sources_to_response(response):
-    """Given an OpenAI completion response, appends to it GitHub links of the context sources."""
-    urls = [document.metadata["url"] for document in response["context"]]
-    # Deduplicate urls while preserving their order.
-    urls = list(dict.fromkeys(urls))
-    return response["answer"] + "\n\nSources:\n" + "\n".join(urls)
 def main():
     parser = argparse.ArgumentParser(description="UI to chat with your codebase")
     parser.add_argument("repo_id", help="The ID of the repository to index")
@@ -112,6 +105,13 @@ def main():
         default=False,
         help="Whether to make the gradio app publicly accessible.",
     )
     args = parser.parse_args()
     if not args.index_name:

     """Builds a RAG chain via LangChain."""
     llm = build_llm_via_langchain(args.llm_provider, args.llm_model)
+    retriever_top_k = 5 if args.reranker_provider == "none" else 25
+    retriever = vector_store.build_from_args(args).as_retriever(top_k=retriever_top_k)
     if args.reranker_provider == "none":
         compressor = None
     return rag_chain
 def main():
     parser = argparse.ArgumentParser(description="UI to chat with your codebase")
     parser.add_argument("repo_id", help="The ID of the repository to index")
         default=False,
         help="Whether to make the gradio app publicly accessible.",
     )
+    parser.add_argument(
+        "--hybrid-retrieval",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Whether to use a hybrid of vector DB + BM25 retrieval. When set to False, we only use vector DB "
+        "retrieval. This is only relevant if using Pinecone as the vector store.",
+    )
     args = parser.parse_args()
     if not args.index_name:

sage/chunker.py CHANGED Viewed

@@ -14,6 +14,8 @@ from semchunk import chunk as chunk_via_semchunk
 from tree_sitter import Node
 from tree_sitter_language_pack import get_parser
 logger = logging.getLogger(__name__)
 tokenizer = tiktoken.get_encoding("cl100k_base")
@@ -62,7 +64,7 @@ class FileChunk(Chunk):
             # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
             # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
             # directly from the repository when needed.
-            "text": self.content,
         }
         chunk_metadata.update(self.file_metadata)
         return chunk_metadata

 from tree_sitter import Node
 from tree_sitter_language_pack import get_parser
+from sage.constants import TEXT_FIELD
 logger = logging.getLogger(__name__)
 tokenizer = tiktoken.get_encoding("cl100k_base")
             # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
             # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
             # directly from the repository when needed.
+            TEXT_FIELD: self.content,
         }
         chunk_metadata.update(self.file_metadata)
         return chunk_metadata

sage/constants.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# This is the key in the metadata that points to the actual text content of a document or chunk.
+# It can mostly be an arbitrary string, but certain classes in LangChain do expect it to be "text" specifically.
+TEXT_FIELD = "text"

sage/embedder.py CHANGED Viewed

@@ -12,6 +12,7 @@ import marqo
 from openai import OpenAI
 from sage.chunker import Chunk, Chunker
 from sage.data_manager import DataManager
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
@@ -139,7 +140,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
                     and "start_byte" in metadata
                     and "end_byte" in metadata
                 ):
-                    metadata.pop("text", None)
                 embedding = datum["embedding"]
                 yield (metadata, embedding)
@@ -240,7 +241,7 @@ class MarqoEmbedder(BatchEmbedder):
                     logging.info("Indexing %d chunks...", len(sub_batch))
                     self.index.add_documents(
                         documents=[chunk.metadata for chunk in sub_batch],
-                        tensor_fields=["text"],
                     )
                     job_count += 1
@@ -251,7 +252,7 @@ class MarqoEmbedder(BatchEmbedder):
         # Finally, commit the last batch.
         if batch:
-            self.index.add_documents(documents=[chunk.metadata for chunk in batch], tensor_fields=["text"])
         logging.info(f"Successfully embedded {chunk_count} chunks.")
     def embeddings_are_ready(self) -> bool:

 from openai import OpenAI
 from sage.chunker import Chunk, Chunker
+from sage.constants import TEXT_FIELD
 from sage.data_manager import DataManager
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
                     and "start_byte" in metadata
                     and "end_byte" in metadata
                 ):
+                    metadata.pop(TEXT_FIELD, None)
                 embedding = datum["embedding"]
                 yield (metadata, embedding)
                     logging.info("Indexing %d chunks...", len(sub_batch))
                     self.index.add_documents(
                         documents=[chunk.metadata for chunk in sub_batch],
+                        tensor_fields=[TEXT_FIELD],
                     )
                     job_count += 1
         # Finally, commit the last batch.
         if batch:
+            self.index.add_documents(documents=[chunk.metadata for chunk in batch], tensor_fields=[TEXT_FIELD])
         logging.info(f"Successfully embedded {chunk_count} chunks.")
     def embeddings_are_ready(self) -> bool:

sage/github.py CHANGED Viewed

@@ -9,6 +9,7 @@ import requests
 import tiktoken
 from sage.chunker import Chunk, Chunker
 from sage.data_manager import DataManager
 tokenizer = tiktoken.get_encoding("cl100k_base")
@@ -179,7 +180,7 @@ class IssueChunk(Chunk):
             # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
             # size limit. In that case, you can simply store the start/end comment indices above, and fetch the
             # content of the issue on demand from the URL.
-            "text": self.content,
         }
     @property

 import tiktoken
 from sage.chunker import Chunk, Chunker
+from sage.constants import TEXT_FIELD
 from sage.data_manager import DataManager
 tokenizer = tiktoken.get_encoding("cl100k_base")
             # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
             # size limit. In that case, you can simply store the start/end comment indices above, and fetch the
             # content of the issue on demand from the URL.
+            TEXT_FIELD: self.content,
         }
     @property

sage/index.py CHANGED Viewed

@@ -118,6 +118,13 @@ def main():
         "GitHub's API for downloading comments is quite slow. Indexing solely the body of an issue seems to bring most "
         "of the gains anyway.",
     )
     args = parser.parse_args()
     # Validate embedder and vector store compatibility.

         "GitHub's API for downloading comments is quite slow. Indexing solely the body of an issue seems to bring most "
         "of the gains anyway.",
     )
+    parser.add_argument(
+        "--hybrid-retrieval",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Whether to use a hybrid of vector DB + BM25 retrieval. When set to False, we only use vector DB "
+        "retrieval. This is only relevant if using Pinecone as the vector store.",
+    )
     args = parser.parse_args()
     # Validate embedder and vector store compatibility.

sage/vector_store.py CHANGED Viewed

@@ -1,14 +1,19 @@
 """Vector store abstraction and implementations."""
 from abc import ABC, abstractmethod
 from typing import Dict, Generator, List, Tuple
 import marqo
 from langchain_community.vectorstores import Marqo
 from langchain_community.vectorstores import Pinecone as LangChainPinecone
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
-from pinecone import Pinecone
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
@@ -36,34 +41,77 @@ class VectorStore(ABC):
             self.upsert_batch(batch)
     @abstractmethod
-    def to_langchain(self):
-        """Converts the vector store to a LangChain vector store object."""
 class PineconeVectorStore(VectorStore):
     """Vector store implementation using Pinecone."""
-    def __init__(self, index_name: str, namespace: str, dimension: int):
         self.index_name = index_name
         self.dimension = dimension
         self.client = Pinecone()
-        self.index = self.client.Index(self.index_name)
         self.namespace = namespace
     def ensure_exists(self):
         if self.index_name not in self.client.list_indexes().names():
-            self.client.create_index(name=self.index_name, dimension=self.dimension, metric="cosine")
     def upsert_batch(self, vectors: List[Vector]):
-        pinecone_vectors = [
-            (metadata.get("id", str(i)), embedding, metadata) for i, (metadata, embedding) in enumerate(vectors)
-        ]
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)
-    def to_langchain(self):
         return LangChainPinecone.from_existing_index(
             index_name=self.index_name, embedding=OpenAIEmbeddings(), namespace=self.namespace
-        )
 class MarqoVectorStore(VectorStore):
@@ -80,7 +128,7 @@ class MarqoVectorStore(VectorStore):
         # Since Marqo is both an embedder and a vector store, the embedder is already doing the upsert.
         pass
-    def to_langchain(self):
         vectorstore = Marqo(client=self.client, index_name=self.index_name)
         # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in
@@ -88,21 +136,23 @@ class MarqoVectorStore(VectorStore):
         def patched_method(self, results):
             documents: List[Document] = []
             for result in results["hits"]:
-                content = result.pop("text")
                 documents.append(Document(page_content=content, metadata=result))
             return documents
         vectorstore._construct_documents_from_results_without_score = patched_method.__get__(
             vectorstore, vectorstore.__class__
         )
-        return vectorstore
 def build_from_args(args: dict) -> VectorStore:
     """Builds a vector store from the given command-line arguments."""
     if args.vector_store_type == "pinecone":
         dimension = args.embedding_size if "embedding_size" in args else None
-        return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id, dimension=dimension)
     elif args.vector_store_type == "marqo":
         return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
     else:

 """Vector store abstraction and implementations."""
 from abc import ABC, abstractmethod
+from functools import cached_property
 from typing import Dict, Generator, List, Tuple
 import marqo
+from langchain_community.retrievers import PineconeHybridSearchRetriever
 from langchain_community.vectorstores import Marqo
 from langchain_community.vectorstores import Pinecone as LangChainPinecone
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
+from pinecone import Pinecone, ServerlessSpec
+from pinecone_text.sparse import BM25Encoder
+from sage.constants import TEXT_FIELD
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
             self.upsert_batch(batch)
     @abstractmethod
+    def as_retriever(self, top_k: int):
+        """Converts the vector store to a LangChain retriever object."""
 class PineconeVectorStore(VectorStore):
     """Vector store implementation using Pinecone."""
+    def __init__(self, index_name: str, namespace: str, dimension: int, hybrid: bool = True):
         self.index_name = index_name
         self.dimension = dimension
         self.client = Pinecone()
         self.namespace = namespace
+        self.hybrid = hybrid
+        # The default BM25 encoder was fit on the MS MARCO dataset.
+        # See https://docs.pinecone.io/guides/data/encode-sparse-vectors
+        # In the future, we should fit the encoder on the current dataset. It's somewhat non-trivial for large datasets,
+        # because most BM25 implementations require the entire dataset to fit in memory.
+        self.bm25_encoder = BM25Encoder.default() if hybrid else None
+    @cached_property
+    def index(self):
+        self.ensure_exists()
+        index = self.client.Index(self.index_name)
+        # Hack around the fact that PineconeRetriever expects the content of the chunk to be in a "text" field,
+        # while PineconeHybridSearchRetriever expects it to be in a "context" field.
+        original_query = index.query
+        def patched_query(*args, **kwargs):
+            result = original_query(*args, **kwargs)
+            for res in result["matches"]:
+                res["metadata"]["context"] = res["metadata"][TEXT_FIELD]
+            return result
+        index.query = patched_query
+        return index
     def ensure_exists(self):
         if self.index_name not in self.client.list_indexes().names():
+            self.client.create_index(
+                name=self.index_name,
+                dimension=self.dimension,
+                # See https://www.pinecone.io/learn/hybrid-search-intro/
+                metric="dotproduct" if self.hybrid else "cosine",
+                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+            )
     def upsert_batch(self, vectors: List[Vector]):
+        pinecone_vectors = []
+        for i, (metadata, embedding) in enumerate(vectors):
+            vector = {"id": metadata.get("id", str(i)), "values": embedding, "metadata": metadata}
+            if self.bm25_encoder:
+                vector["sparse_values"] = self.bm25_encoder.encode_documents(metadata[TEXT_FIELD])
+            pinecone_vectors.append(vector)
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)
+    def as_retriever(self, top_k: int):
+        if self.bm25_encoder:
+            return PineconeHybridSearchRetriever(
+                embeddings=OpenAIEmbeddings(),
+                sparse_encoder=self.bm25_encoder,
+                index=self.index,
+                namespace=self.namespace,
+                top_k=top_k,
+                alpha=0.5,
+            )
         return LangChainPinecone.from_existing_index(
             index_name=self.index_name, embedding=OpenAIEmbeddings(), namespace=self.namespace
+        ).as_retriever(search_kwargs={"k": top_k})
 class MarqoVectorStore(VectorStore):
         # Since Marqo is both an embedder and a vector store, the embedder is already doing the upsert.
         pass
+    def as_retriever(self, top_k: int):
         vectorstore = Marqo(client=self.client, index_name=self.index_name)
         # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in
         def patched_method(self, results):
             documents: List[Document] = []
             for result in results["hits"]:
+                content = result.pop(TEXT_FIELD)
                 documents.append(Document(page_content=content, metadata=result))
             return documents
         vectorstore._construct_documents_from_results_without_score = patched_method.__get__(
             vectorstore, vectorstore.__class__
         )
+        return vectorstore.as_retriever(search_kwargs={"k": top_k})
 def build_from_args(args: dict) -> VectorStore:
     """Builds a vector store from the given command-line arguments."""
     if args.vector_store_type == "pinecone":
         dimension = args.embedding_size if "embedding_size" in args else None
+        return PineconeVectorStore(
+            index_name=args.index_name, namespace=args.repo_id, dimension=dimension, hybrid=args.hybrid_retrieval
+        )
     elif args.vector_store_type == "marqo":
         return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
     else: