Spaces:

Asish22
/

code-crawler

Running

App Files Community

juliaturc committed on Aug 30, 2024

Commit

57007fe

1 Parent(s): e8553c3

Support marqo on the inference side and format code.

Browse files

Files changed (6) hide show

src/chat.py +38 -18
src/chunker.py +9 -22
src/embedder.py +7 -15
src/index.py +18 -23
src/repo_manager.py +7 -21
src/vector_store.py +3 -5

src/chat.py CHANGED Viewed

@@ -4,14 +4,17 @@ You must run main.py first in order to index the codebase into a vector store.
 """
 import argparse
-from dotenv import load_dotenv
 import gradio as gr
-from langchain.chains import create_history_aware_retriever, create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.schema import AIMessage, HumanMessage
-from langchain_community.vectorstores import Pinecone
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
@@ -24,10 +27,29 @@ def build_rag_chain(args):
     """Builds a RAG chain via LangChain."""
     llm = ChatOpenAI(model=args.openai_model)
-    vectorstore = Pinecone.from_existing_index(
-        index_name=args.pinecone_index_name,
-        embedding=OpenAIEmbeddings(),
-        namespace=args.repo_id,
     )
     retriever = vectorstore.as_retriever()
@@ -45,9 +67,7 @@ def build_rag_chain(args):
             ("human", "{input}"),
         ]
     )
-    history_aware_retriever = create_history_aware_retriever(
-        llm, retriever, contextualize_q_prompt
-    )
     qa_system_prompt = (
         f"You are my coding buddy, helping me quickly understand a GitHub repository called {args.repo_id}."
@@ -76,9 +96,7 @@ def append_sources_to_response(response):
     # Deduplicate filenames while preserving their order.
     filenames = list(dict.fromkeys(filenames))
     repo_manager = RepoManager(args.repo_id)
-    github_links = [
-        repo_manager.github_link_for_file(filename) for filename in filenames
-    ]
     return response["answer"] + "\n\nSources:\n" + "\n".join(github_links)
@@ -90,8 +108,12 @@ if __name__ == "__main__":
         default="gpt-4",
         help="The OpenAI model to use for response generation",
     )
     parser.add_argument(
-        "--pinecone_index_name", required=True, help="Pinecone index name"
     )
     parser.add_argument(
         "--share",
@@ -109,9 +131,7 @@ if __name__ == "__main__":
             history_langchain_format.append(HumanMessage(content=human))
             history_langchain_format.append(AIMessage(content=ai))
         history_langchain_format.append(HumanMessage(content=message))
-        response = rag_chain.invoke(
-            {"input": message, "chat_history": history_langchain_format}
-        )
         answer = append_sources_to_response(response)
         return answer

 """
 import argparse
+from typing import List
 import gradio as gr
+import marqo
+from dotenv import load_dotenv
+from langchain.chains import (create_history_aware_retriever,
+                              create_retrieval_chain)
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.schema import AIMessage, HumanMessage
+from langchain_community.vectorstores import Marqo, Pinecone
+from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
     """Builds a RAG chain via LangChain."""
     llm = ChatOpenAI(model=args.openai_model)
+    if args.vector_store_type == "pinecone":
+        vectorstore = Pinecone.from_existing_index(
+            index_name=args.pinecone_index_name,
+            embedding=OpenAIEmbeddings(),
+            namespace=args.repo_id,
+        )
+    elif args.vector_store_type == "marqo":
+        marqo_client = marqo.Client(url=args.marqo_url)
+        vectorstore = Marqo(
+            client=marqo_client,
+            index_name=args.index_name,
+        )
+    # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in the
+    # result, and instead take the "filename" directly from the result.
+    def patched_method(self, results):
+        documents: List[Document] = []
+        for res in results["hits"]:
+            documents.append(Document(page_content=res["text"], metadata={"filename": res["filename"]}))
+        return documents
+    vectorstore._construct_documents_from_results_without_score = patched_method.__get__(
+        vectorstore, vectorstore.__class__
     )
     retriever = vectorstore.as_retriever()
             ("human", "{input}"),
         ]
     )
+    history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
     qa_system_prompt = (
         f"You are my coding buddy, helping me quickly understand a GitHub repository called {args.repo_id}."
     # Deduplicate filenames while preserving their order.
     filenames = list(dict.fromkeys(filenames))
     repo_manager = RepoManager(args.repo_id)
+    github_links = [repo_manager.github_link_for_file(filename) for filename in filenames]
     return response["answer"] + "\n\nSources:\n" + "\n".join(github_links)
         default="gpt-4",
         help="The OpenAI model to use for response generation",
     )
+    parser.add_argument("--vector_store_type", default="pinecone", choices=["pinecone", "marqo"])
+    parser.add_argument("--index_name", required=True, help="Vector store index name")
     parser.add_argument(
+        "--marqo_url",
+        default="http://localhost:8882",
+        help="URL for the Marqo server. Required if using Marqo as embedder or vector store.",
     )
     parser.add_argument(
         "--share",
             history_langchain_format.append(HumanMessage(content=human))
             history_langchain_format.append(AIMessage(content=ai))
         history_langchain_format.append(HumanMessage(content=message))
+        response = rag_chain.invoke({"input": message, "chat_history": history_langchain_format})
         answer = append_sources_to_response(response)
         return answer

src/chunker.py CHANGED Viewed

@@ -1,12 +1,12 @@
 """Chunker abstraction and implementations."""
 import logging
-import nbformat
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
 from typing import List, Optional
 import pygments
 import tiktoken
 from semchunk import chunk as chunk_via_semchunk
@@ -31,7 +31,7 @@ class Chunk:
         return self._content
     @property
-    def to_dict(self):
         """Converts the chunk to a dictionary that can be passed to a vector store."""
         # Some vector stores require the IDs to be ASCII.
         filename_ascii = self.filename.encode("ascii", "ignore").decode("ascii")
@@ -49,9 +49,7 @@ class Chunk:
     def populate_content(self, file_content: str):
         """Populates the content of the chunk with the file path and file content."""
-        self._content = (
-            self.filename + "\n\n" + file_content[self.start_byte : self.end_byte]
-        )
     def num_tokens(self, tokenizer):
         """Counts the number of tokens in the chunk."""
@@ -115,9 +113,7 @@ class CodeChunker(Chunker):
         if not node.children:
             # This is a leaf node, but it's too long. We'll have to split it with a text tokenizer.
-            return self.text_chunker.chunk(
-                filename, file_content[node.start_byte : node.end_byte]
-            )
         chunks = []
         for child in node.children:
@@ -133,11 +129,7 @@ class CodeChunker(Chunker):
         for chunk in chunks:
             if not merged_chunks:
                 merged_chunks.append(chunk)
-            elif (
-                merged_chunks[-1].num_tokens(self.tokenizer)
-                + chunk.num_tokens(self.tokenizer)
-                < self.max_tokens - 50
-            ):
                 # There's a good chance that merging these two chunks will be under the token limit. We're not 100% sure
                 # at this point, because tokenization is not necessarily additive.
                 merged = Chunk(
@@ -203,9 +195,7 @@ class CodeChunker(Chunker):
             # a bug in the code.
             assert chunk.content
             size = chunk.num_tokens(self.tokenizer)
-            assert (
-                size <= self.max_tokens
-            ), f"Chunk size {size} exceeds max_tokens {self.max_tokens}."
         return chunks
@@ -217,17 +207,13 @@ class TextChunker(Chunker):
         self.max_tokens = max_tokens
         tokenizer = tiktoken.get_encoding("cl100k_base")
-        self.count_tokens = lambda text: len(
-            tokenizer.encode(text, disallowed_special=())
-        )
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
         """Chunks a text file into smaller pieces."""
         # We need to allocate some tokens for the filename, which is part of the chunk content.
         extra_tokens = self.count_tokens(file_path + "\n\n")
-        text_chunks = chunk_via_semchunk(
-            file_content, self.max_tokens - extra_tokens, self.count_tokens
-        )
         chunks = []
         start = 0
@@ -252,6 +238,7 @@ class IPYNBChunker(Chunker):
     Based on https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb
     """
     def __init__(self, code_chunker: CodeChunker):
         self.code_chunker = code_chunker

 """Chunker abstraction and implementations."""
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
 from typing import List, Optional
+import nbformat
 import pygments
 import tiktoken
 from semchunk import chunk as chunk_via_semchunk
         return self._content
     @property
+    def to_metadata(self):
         """Converts the chunk to a dictionary that can be passed to a vector store."""
         # Some vector stores require the IDs to be ASCII.
         filename_ascii = self.filename.encode("ascii", "ignore").decode("ascii")
     def populate_content(self, file_content: str):
         """Populates the content of the chunk with the file path and file content."""
+        self._content = self.filename + "\n\n" + file_content[self.start_byte : self.end_byte]
     def num_tokens(self, tokenizer):
         """Counts the number of tokens in the chunk."""
         if not node.children:
             # This is a leaf node, but it's too long. We'll have to split it with a text tokenizer.
+            return self.text_chunker.chunk(filename, file_content[node.start_byte : node.end_byte])
         chunks = []
         for child in node.children:
         for chunk in chunks:
             if not merged_chunks:
                 merged_chunks.append(chunk)
+            elif merged_chunks[-1].num_tokens(self.tokenizer) + chunk.num_tokens(self.tokenizer) < self.max_tokens - 50:
                 # There's a good chance that merging these two chunks will be under the token limit. We're not 100% sure
                 # at this point, because tokenization is not necessarily additive.
                 merged = Chunk(
             # a bug in the code.
             assert chunk.content
             size = chunk.num_tokens(self.tokenizer)
+            assert size <= self.max_tokens, f"Chunk size {size} exceeds max_tokens {self.max_tokens}."
         return chunks
         self.max_tokens = max_tokens
         tokenizer = tiktoken.get_encoding("cl100k_base")
+        self.count_tokens = lambda text: len(tokenizer.encode(text, disallowed_special=()))
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
         """Chunks a text file into smaller pieces."""
         # We need to allocate some tokens for the filename, which is part of the chunk content.
         extra_tokens = self.count_tokens(file_path + "\n\n")
+        text_chunks = chunk_via_semchunk(file_content, self.max_tokens - extra_tokens, self.count_tokens)
         chunks = []
         start = 0
     Based on https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb
     """
     def __init__(self, code_chunker: CodeChunker):
         self.code_chunker = code_chunker

src/embedder.py CHANGED Viewed

@@ -7,11 +7,11 @@ from abc import ABC, abstractmethod
 from collections import Counter
 from typing import Dict, Generator, List, Tuple
 from openai import OpenAI
 from chunker import Chunk, Chunker
 from repo_manager import RepoManager
-import marqo
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
@@ -63,7 +63,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
                     openai_batch_id = self._issue_job_for_chunks(
                         sub_batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
                     )
-                    self.openai_batch_ids[openai_batch_id] = [chunk.to_dict for chunk in sub_batch]
                     if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
                         logging.info("Reached the maximum number of embedding jobs. Stopping.")
                         return
@@ -72,7 +72,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
         # Finally, commit the last batch.
         if batch:
             openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}")
-            self.openai_batch_ids[openai_batch_id] = [chunk.to_dict for chunk in batch]
         logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
         # Save the job IDs to a file, just in case this script is terminated by mistake.
@@ -179,12 +179,7 @@ class MarqoEmbedder(BatchEmbedder):
     Embeddings can be stored locally (in which case `url` the constructor should point to localhost) or in the cloud.
     """
-    def __init__(self,
-                 repo_manager: RepoManager,
-                 chunker: Chunker,
-                 index_name: str,
-                 url: str,
-                 model="hf/e5-base-v2"):
         self.repo_manager = repo_manager
         self.chunker = chunker
         self.client = marqo.Client(url=url)
@@ -212,8 +207,8 @@ class MarqoEmbedder(BatchEmbedder):
                     sub_batch = batch[i : i + chunks_per_batch]
                     logging.info("Indexing %d chunks...", len(sub_batch))
                     self.index.add_documents(
-                        documents=[chunk.to_dict for chunk in sub_batch],
-                        tensor_fields=["text"]
                     )
                     if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
@@ -223,10 +218,7 @@ class MarqoEmbedder(BatchEmbedder):
         # Finally, commit the last batch.
         if batch:
-            self.index.add_documents(
-                documents=[chunk.to_dict for chunk in batch],
-                tensor_fields=["text"]
-            )
         logging.info(f"Successfully embedded {chunk_count} chunks.")
     def embeddings_are_ready(self) -> bool:

 from collections import Counter
 from typing import Dict, Generator, List, Tuple
+import marqo
 from openai import OpenAI
 from chunker import Chunk, Chunker
 from repo_manager import RepoManager
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
                     openai_batch_id = self._issue_job_for_chunks(
                         sub_batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
                     )
+                    self.openai_batch_ids[openai_batch_id] = [chunk.to_metadata for chunk in sub_batch]
                     if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
                         logging.info("Reached the maximum number of embedding jobs. Stopping.")
                         return
         # Finally, commit the last batch.
         if batch:
             openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}")
+            self.openai_batch_ids[openai_batch_id] = [chunk.to_metadata for chunk in batch]
         logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
         # Save the job IDs to a file, just in case this script is terminated by mistake.
     Embeddings can be stored locally (in which case `url` the constructor should point to localhost) or in the cloud.
     """
+    def __init__(self, repo_manager: RepoManager, chunker: Chunker, index_name: str, url: str, model="hf/e5-base-v2"):
         self.repo_manager = repo_manager
         self.chunker = chunker
         self.client = marqo.Client(url=url)
                     sub_batch = batch[i : i + chunks_per_batch]
                     logging.info("Indexing %d chunks...", len(sub_batch))
                     self.index.add_documents(
+                        documents=[chunk.to_metadata for chunk in sub_batch],
+                        tensor_fields=["text"],
                     )
                     if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
         # Finally, commit the last batch.
         if batch:
+            self.index.add_documents(documents=[chunk.to_metadata for chunk in batch], tensor_fields=["text"])
         logging.info(f"Successfully embedded {chunk_count} chunks.")
     def embeddings_are_ready(self) -> bool:

src/index.py CHANGED Viewed

@@ -5,19 +5,15 @@ import logging
 import time
 from chunker import UniversalChunker
-from embedder import OpenAIBatchEmbedder, MarqoEmbedder
 from repo_manager import RepoManager
 from vector_store import PineconeVectorStore
 logging.basicConfig(level=logging.INFO)
 OPENAI_EMBEDDING_SIZE = 1536
-MAX_TOKENS_PER_CHUNK = (
-    8192  # The ADA embedder from OpenAI has a maximum of 8192 tokens.
-)
-MAX_CHUNKS_PER_BATCH = (
-    2048  # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
-)
 MAX_TOKENS_PER_JOB = 3_000_000  # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
@@ -43,11 +39,12 @@ def main():
         help="https://arxiv.org/pdf/2406.14497 recommends a value between 200-800.",
     )
     parser.add_argument(
-        "--chunks_per_batch", type=int, default=2000, help="Maximum chunks per batch"
-    )
-    parser.add_argument(
-        "--index_name", required=True, help="Vector store index name"
     )
     parser.add_argument(
         "--include",
         help="Path to a file containing a list of extensions to include. One extension per line.",
@@ -58,7 +55,8 @@ def main():
         help="Path to a file containing a list of extensions to exclude. One extension per line.",
     )
     parser.add_argument(
-        "--max_embedding_jobs", type=int,
         help="Maximum number of embedding jobs to run. Specifying this might result in "
         "indexing only part of the repository, but prevents you from burning through OpenAI credits.",
     )
@@ -79,16 +77,15 @@ def main():
         parser.error("When using OpenAI embedder, the vector store type must be Pinecone.")
     if args.embedder_type == "marqo" and args.vector_store_type != "marqo":
         parser.error("When using the marqo embedder, the vector store type must also be marqo.")
     # Validate other arguments.
     if args.tokens_per_chunk > MAX_TOKENS_PER_CHUNK:
-        parser.error(
-            f"The maximum number of tokens per chunk is {MAX_TOKENS_PER_CHUNK}."
-        )
     if args.chunks_per_batch > MAX_CHUNKS_PER_BATCH:
-        parser.error(
-            f"The maximum number of chunks per batch is {MAX_CHUNKS_PER_BATCH}."
-        )
     if args.tokens_per_chunk * args.chunks_per_batch >= MAX_TOKENS_PER_JOB:
         parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
     if args.include and args.exclude:
@@ -112,11 +109,9 @@ def main():
     if args.embedder_type == "openai":
         embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
     elif args.embedder_type == "marqo":
-        embedder = MarqoEmbedder(repo_manager,
-                                 chunker,
-                                 index_name=args.index_name,
-                                 url=args.marqo_url,
-                                 model=args.marqo_embedding_model)
     else:
         raise ValueError(f"Unrecognized embedder type {args.embedder_type}")

 import time
 from chunker import UniversalChunker
+from embedder import MarqoEmbedder, OpenAIBatchEmbedder
 from repo_manager import RepoManager
 from vector_store import PineconeVectorStore
 logging.basicConfig(level=logging.INFO)
 OPENAI_EMBEDDING_SIZE = 1536
+MAX_TOKENS_PER_CHUNK = 8192  # The ADA embedder from OpenAI has a maximum of 8192 tokens.
+MAX_CHUNKS_PER_BATCH = 2048  # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
 MAX_TOKENS_PER_JOB = 3_000_000  # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
         help="https://arxiv.org/pdf/2406.14497 recommends a value between 200-800.",
     )
     parser.add_argument(
+        "--chunks_per_batch",
+        type=int,
+        default=2000,
+        help="Maximum chunks per batch. We recommend 2000 for the OpenAI embedder. Marqo enforces a limit of 64.",
     )
+    parser.add_argument("--index_name", required=True, help="Vector store index name")
     parser.add_argument(
         "--include",
         help="Path to a file containing a list of extensions to include. One extension per line.",
         help="Path to a file containing a list of extensions to exclude. One extension per line.",
     )
     parser.add_argument(
+        "--max_embedding_jobs",
+        type=int,
         help="Maximum number of embedding jobs to run. Specifying this might result in "
         "indexing only part of the repository, but prevents you from burning through OpenAI credits.",
     )
         parser.error("When using OpenAI embedder, the vector store type must be Pinecone.")
     if args.embedder_type == "marqo" and args.vector_store_type != "marqo":
         parser.error("When using the marqo embedder, the vector store type must also be marqo.")
+    if args.embedder_type == "marqo" and args.chunks_per_batch > 64:
+        args.chunks_per_batch = 64
+        logging.warning("Marqo enforces a limit of 64 chunks per batch. Setting --chunks_per_batch to 64.")
     # Validate other arguments.
     if args.tokens_per_chunk > MAX_TOKENS_PER_CHUNK:
+        parser.error(f"The maximum number of tokens per chunk is {MAX_TOKENS_PER_CHUNK}.")
     if args.chunks_per_batch > MAX_CHUNKS_PER_BATCH:
+        parser.error(f"The maximum number of chunks per batch is {MAX_CHUNKS_PER_BATCH}.")
     if args.tokens_per_chunk * args.chunks_per_batch >= MAX_TOKENS_PER_JOB:
         parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
     if args.include and args.exclude:
     if args.embedder_type == "openai":
         embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
     elif args.embedder_type == "marqo":
+        embedder = MarqoEmbedder(
+            repo_manager, chunker, index_name=args.index_name, url=args.marqo_url, model=args.marqo_embedding_model
+        )
     else:
         raise ValueError(f"Unrecognized embedder type {args.embedder_type}")

src/repo_manager.py CHANGED Viewed

@@ -35,9 +35,7 @@ class RepoManager:
     @cached_property
     def is_public(self) -> bool:
         """Checks whether a GitHub repository is publicly visible."""
-        response = requests.get(
-            f"https://api.github.com/repos/{self.repo_id}", timeout=10
-        )
         # Note that the response will be 404 for both private and non-existent repos.
         return response.status_code == 200
@@ -50,17 +48,13 @@ class RepoManager:
         if self.access_token:
             headers["Authorization"] = f"token {self.access_token}"
-        response = requests.get(
-            f"https://api.github.com/repos/{self.repo_id}", headers=headers
-        )
         if response.status_code == 200:
             branch = response.json().get("default_branch", "main")
         else:
             # This happens sometimes when we exceed the Github rate limit. The best bet in this case is to assume the
             # most common naming for the default branch ("main").
-            logging.warn(
-                f"Unable to fetch default branch for {self.repo_id}: {response.text}"
-            )
             branch = "main"
         return branch
@@ -81,9 +75,7 @@ class RepoManager:
         try:
             Repo.clone_from(clone_url, self.local_path, depth=1, single_branch=True)
         except GitCommandError as e:
-            logging.error(
-                "Unable to clone %s from %s. Error: %s", self.repo_id, clone_url, e
-            )
             return False
         return True
@@ -130,9 +122,7 @@ class RepoManager:
                     for path in included_file_paths:
                         f.write(path + "\n")
-                excluded_file_paths = set(file_paths).difference(
-                    set(included_file_paths)
-                )
                 with open(excluded_log_file, "a") as f:
                     for path in excluded_file_paths:
                         f.write(path + "\n")
@@ -142,15 +132,11 @@ class RepoManager:
                     try:
                         contents = f.read()
                     except UnicodeDecodeError:
-                        logging.warning(
-                            "Unable to decode file %s. Skipping.", file_path
-                        )
                         continue
                     yield file_path[len(self.local_dir) + 1 :], contents
     def github_link_for_file(self, file_path: str) -> str:
         """Converts a repository file path to a GitHub link."""
         file_path = file_path[len(self.repo_id) :]
-        return (
-            f"https://github.com/{self.repo_id}/blob/{self.default_branch}/{file_path}"
-        )

     @cached_property
     def is_public(self) -> bool:
         """Checks whether a GitHub repository is publicly visible."""
+        response = requests.get(f"https://api.github.com/repos/{self.repo_id}", timeout=10)
         # Note that the response will be 404 for both private and non-existent repos.
         return response.status_code == 200
         if self.access_token:
             headers["Authorization"] = f"token {self.access_token}"
+        response = requests.get(f"https://api.github.com/repos/{self.repo_id}", headers=headers)
         if response.status_code == 200:
             branch = response.json().get("default_branch", "main")
         else:
             # This happens sometimes when we exceed the Github rate limit. The best bet in this case is to assume the
             # most common naming for the default branch ("main").
+            logging.warn(f"Unable to fetch default branch for {self.repo_id}: {response.text}")
             branch = "main"
         return branch
         try:
             Repo.clone_from(clone_url, self.local_path, depth=1, single_branch=True)
         except GitCommandError as e:
+            logging.error("Unable to clone %s from %s. Error: %s", self.repo_id, clone_url, e)
             return False
         return True
                     for path in included_file_paths:
                         f.write(path + "\n")
+                excluded_file_paths = set(file_paths).difference(set(included_file_paths))
                 with open(excluded_log_file, "a") as f:
                     for path in excluded_file_paths:
                         f.write(path + "\n")
                     try:
                         contents = f.read()
                     except UnicodeDecodeError:
+                        logging.warning("Unable to decode file %s. Skipping.", file_path)
                         continue
                     yield file_path[len(self.local_dir) + 1 :], contents
     def github_link_for_file(self, file_path: str) -> str:
         """Converts a repository file path to a GitHub link."""
         file_path = file_path[len(self.repo_id) :]
+        return f"https://github.com/{self.repo_id}/blob/{self.default_branch}/{file_path}"

src/vector_store.py CHANGED Viewed

@@ -10,6 +10,7 @@ Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
 class VectorStore(ABC):
     """Abstract class for a vector store."""
     @abstractmethod
     def ensure_exists(self):
         """Ensures that the vector store exists. Creates it if it doesn't."""
@@ -42,13 +43,10 @@ class PineconeVectorStore(VectorStore):
     def ensure_exists(self):
         if self.index_name not in self.client.list_indexes().names():
-            self.client.create_index(
-                name=self.index_name, dimension=self.dimension, metric="cosine"
-            )
     def upsert_batch(self, vectors: List[Vector]):
         pinecone_vectors = [
-            (metadata.get("id", str(i)), embedding, metadata)
-            for i, (metadata, embedding) in enumerate(vectors)
         ]
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)

 class VectorStore(ABC):
     """Abstract class for a vector store."""
     @abstractmethod
     def ensure_exists(self):
         """Ensures that the vector store exists. Creates it if it doesn't."""
     def ensure_exists(self):
         if self.index_name not in self.client.list_indexes().names():
+            self.client.create_index(name=self.index_name, dimension=self.dimension, metric="cosine")
     def upsert_batch(self, vectors: List[Vector]):
         pinecone_vectors = [
+            (metadata.get("id", str(i)), embedding, metadata) for i, (metadata, embedding) in enumerate(vectors)
         ]
         self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)