Spaces:

Asish22
/

code-crawler

Running

App Files Community

juliaturc committed on Aug 30, 2024

Commit

5f9eeb4

1 Parent(s): 57007fe

Clean up the structure of the code.

Browse files

Files changed (3) hide show

src/chat.py +3 -32
src/index.py +2 -7
src/vector_store.py +56 -1

src/chat.py CHANGED Viewed

@@ -4,20 +4,17 @@ You must run main.py first in order to index the codebase into a vector store.
 """
 import argparse
-from typing import List
 import gradio as gr
-import marqo
 from dotenv import load_dotenv
 from langchain.chains import (create_history_aware_retriever,
                               create_retrieval_chain)
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.schema import AIMessage, HumanMessage
-from langchain_community.vectorstores import Marqo, Pinecone
-from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from repo_manager import RepoManager
 load_dotenv()
@@ -26,33 +23,7 @@ load_dotenv()
 def build_rag_chain(args):
     """Builds a RAG chain via LangChain."""
     llm = ChatOpenAI(model=args.openai_model)
-    if args.vector_store_type == "pinecone":
-        vectorstore = Pinecone.from_existing_index(
-            index_name=args.pinecone_index_name,
-            embedding=OpenAIEmbeddings(),
-            namespace=args.repo_id,
-        )
-    elif args.vector_store_type == "marqo":
-        marqo_client = marqo.Client(url=args.marqo_url)
-        vectorstore = Marqo(
-            client=marqo_client,
-            index_name=args.index_name,
-        )
-    # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in the
-    # result, and instead take the "filename" directly from the result.
-    def patched_method(self, results):
-        documents: List[Document] = []
-        for res in results["hits"]:
-            documents.append(Document(page_content=res["text"], metadata={"filename": res["filename"]}))
-        return documents
-    vectorstore._construct_documents_from_results_without_score = patched_method.__get__(
-        vectorstore, vectorstore.__class__
-    )
-    retriever = vectorstore.as_retriever()
     # Prompt to contextualize the latest query based on the chat history.
     contextualize_q_system_prompt = (

 """
 import argparse
 import gradio as gr
 from dotenv import load_dotenv
 from langchain.chains import (create_history_aware_retriever,
                               create_retrieval_chain)
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.schema import AIMessage, HumanMessage
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_openai import ChatOpenAI
+import vector_store
 from repo_manager import RepoManager
 load_dotenv()
 def build_rag_chain(args):
     """Builds a RAG chain via LangChain."""
     llm = ChatOpenAI(model=args.openai_model)
+    retriever = vector_store.build_from_args(args).to_langchain().as_retriever()
     # Prompt to contextualize the latest query based on the chat history.
     contextualize_q_system_prompt = (

src/index.py CHANGED Viewed

@@ -7,11 +7,10 @@ import time
 from chunker import UniversalChunker
 from embedder import MarqoEmbedder, OpenAIBatchEmbedder
 from repo_manager import RepoManager
-from vector_store import PineconeVectorStore
 logging.basicConfig(level=logging.INFO)
-OPENAI_EMBEDDING_SIZE = 1536
 MAX_TOKENS_PER_CHUNK = 8192  # The ADA embedder from OpenAI has a maximum of 8192 tokens.
 MAX_CHUNKS_PER_BATCH = 2048  # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
 MAX_TOKENS_PER_JOB = 3_000_000  # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
@@ -129,11 +128,7 @@ def main():
     logging.info("Moving embeddings to the vector store...")
     # Note to developer: Replace this with your preferred vector store.
-    vector_store = PineconeVectorStore(
-        index_name=args.index_name,
-        dimension=OPENAI_EMBEDDING_SIZE,
-        namespace=repo_manager.repo_id,
-    )
     vector_store.ensure_exists()
     vector_store.upsert(embedder.download_embeddings())
     logging.info("Done!")

 from chunker import UniversalChunker
 from embedder import MarqoEmbedder, OpenAIBatchEmbedder
 from repo_manager import RepoManager
+from vector_store import build_from_args
 logging.basicConfig(level=logging.INFO)
 MAX_TOKENS_PER_CHUNK = 8192  # The ADA embedder from OpenAI has a maximum of 8192 tokens.
 MAX_CHUNKS_PER_BATCH = 2048  # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
 MAX_TOKENS_PER_JOB = 3_000_000  # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
     logging.info("Moving embeddings to the vector store...")
     # Note to developer: Replace this with your preferred vector store.
+    vector_store = build_from_args(args)
     vector_store.ensure_exists()
     vector_store.upsert(embedder.download_embeddings())
     logging.info("Done!")

src/vector_store.py CHANGED Viewed

@@ -3,8 +3,13 @@
 from abc import ABC, abstractmethod
 from typing import Dict, Generator, List, Tuple
 from pinecone import Pinecone
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
@@ -30,11 +35,15 @@ class VectorStore(ABC):
         if batch:
             self.upsert_batch(batch)
 class PineconeVectorStore(VectorStore):
     """Vector store implementation using Pinecone."""
-    def __init__(self, index_name: str, dimension: int, namespace: str):
         self.index_name = index_name
         self.dimension = dimension
         self.client = Pinecone()
@@ -50,3 +59,49 @@ class PineconeVectorStore(VectorStore):
             (metadata.get("id", str(i)), embedding, metadata) for i, (metadata, embedding) in enumerate(vectors)
         ]
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)

 from abc import ABC, abstractmethod
 from typing import Dict, Generator, List, Tuple
+import marqo
+from langchain_community.vectorstores import Marqo
+from langchain_core.documents import Document
+from langchain_openai import OpenAIEmbeddings
 from pinecone import Pinecone
+OPENAI_EMBEDDING_SIZE = 1536
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
         if batch:
             self.upsert_batch(batch)
+    @abstractmethod
+    def to_langchain(self):
+        """Converts the vector store to a LangChain vector store object."""
 class PineconeVectorStore(VectorStore):
     """Vector store implementation using Pinecone."""
+    def __init__(self, index_name: str, namespace: str, dimension: int = OPENAI_EMBEDDING_SIZE):
         self.index_name = index_name
         self.dimension = dimension
         self.client = Pinecone()
             (metadata.get("id", str(i)), embedding, metadata) for i, (metadata, embedding) in enumerate(vectors)
         ]
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)
+    def to_langchain(self):
+        return Pinecone.from_existing_index(
+            index_name=self.index_name, embedding=OpenAIEmbeddings(), namespace=self.namespace
+        )
+class MarqoVectorStore(VectorStore):
+    """Vector store implementation using Marqo."""
+    def __init__(self, url: str, index_name: str):
+        self.client = marqo.Client(url=url)
+        self.index_name = index_name
+    def ensure_exists(self):
+        pass
+    def upsert_batch(self, vectors: List[Vector]):
+        # Since Marqo is both an embedder and a vector store, the embedder is already doing the upsert.
+        pass
+    def to_langchain(self):
+        vectorstore = Marqo(client=self.client, index_name=self.index_name)
+        # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in
+        # the result, and instead take the "filename" directly from the result.
+        def patched_method(self, results):
+            documents: List[Document] = []
+            for res in results["hits"]:
+                documents.append(Document(page_content=res["text"], metadata={"filename": res["filename"]}))
+            return documents
+        vectorstore._construct_documents_from_results_without_score = patched_method.__get__(
+            vectorstore, vectorstore.__class__
+        )
+        return vectorstore
+def build_from_args(args: dict) -> VectorStore:
+    """Builds a vector store from the given command-line arguments."""
+    if args.vector_store_type == "pinecone":
+        return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id)
+    elif args.vector_store_type == "marqo":
+        return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
+    else:
+        raise ValueError(f"Unrecognized vector store type {args.vector_store_type}")